# List all loaded packages
loaded_packages <- names(sessionInfo()$otherPkgs)

# Check for 'rename' function in each package
sapply(loaded_packages, function(pkg) {
  "rename" %in% ls(getNamespace(pkg))
})
##                   DT                  car              carData 
##                FALSE                FALSE                FALSE 
##             pheatmap         ComplexUpset               UpSetR 
##                FALSE                FALSE                FALSE 
##          VennDiagram        futile.logger                Cairo 
##                FALSE                FALSE                FALSE 
##                  egg            gridExtra      EnhancedVolcano 
##                FALSE                FALSE                FALSE 
##              ggrepel               DESeq2 SummarizedExperiment 
##                FALSE                FALSE                FALSE 
##              Biobase       MatrixGenerics          matrixStats 
##                FALSE                FALSE                FALSE 
##                edgeR                limma          rtracklayer 
##                FALSE                FALSE                FALSE 
##        GenomicRanges         GenomeInfoDb              IRanges 
##                FALSE                FALSE                FALSE 
##            S4Vectors         BiocGenerics            lubridate 
##                 TRUE                FALSE                FALSE 
##              forcats              stringr                dplyr 
##                FALSE                FALSE                 TRUE 
##                purrr                readr                tidyr 
##                FALSE                FALSE                FALSE 
##               tibble              ggplot2            tidyverse 
##                FALSE                 TRUE                FALSE
# define filepaths to mapped counts files for all EV and simulated donor reads
sorted_p1_count_path      <- "./input_files/MappCountFlow/output/p1_sort_counts.txt"
sorted_p1_sim1_count_path <- "./input_files/MappCountFlow/output/p1_sim1_sort_counts.txt"
sorted_p1_sim2_count_path <- "./input_files/MappCountFlow/output/p1_sim2_sort_counts.txt"
sorted_p1_sim3_count_path <- "./input_files/MappCountFlow/output/p1_sim3_sort_counts.txt"
sorted_wt_count_path      <- "./input_files/MappCountFlow/output/wt_sort_counts.txt"
sorted_wt_sim1_count_path <- "./input_files/MappCountFlow/output/wt_sim1_sort_counts.txt"
sorted_wt_sim2_count_path <- "./input_files/MappCountFlow/output/wt_sim2_sort_counts.txt"
sorted_wt_sim3_count_path <- "./input_files/MappCountFlow/output/wt_sim3_sort_counts.txt"

# define filepath to gff3 file from re-annotated reference genome
reannotated_gff_path <-"./input_files/Reannotated_ASF/ref_06.gff"

# define filepath to functional annotation file from eggnog mapper (.annotations file)
emapper_path <- "./input_files/Reannotated_ASF/genome_annotation.emapper.annotations"

# define filepath to metadata for study design
metadata_path <- "./input_files/EV_metadata.csv"
  # Should be a .csv file with that looks like the following:
      # +----------+------------+-------+
      # |    id    | simulation | group |
      # +----------+------------+-------+
      # |    p1    |    exp     |   p1  |
      # | p1_sim1  |    sim     |   p1  |
      # | p1_sim2  |    sim     |   p1  |
      # | p1_sim3  |    sim     |   p1  |
      # |    p2    |    exp     |   p2  | # present for benchmarking (poor quality data)
      # | p2_sim1  |    sim     |   p2  | # present for benchmarking (poor quality data)
      # | p2_sim2  |    sim     |   p2  | # present for benchmarking (poor quality data)
      # | p2_sim3  |    sim     |   p2  | # present for benchmarking (poor quality data)
      # |    wt    |    exp     |   wt  |
      # | wt_sim1  |    sim     |   wt  |
      # | wt_sim2  |    sim     |   wt  |
      # | wt_sim3  |    sim     |   wt  |
      # +----------+------------+-------+

# define filepath to list_of_operons file from operon mapper
mapped_operon_path <- "./input_files/operon_mapper_reannotated_gtf/1779193/list_of_operons_1779193"
# combining the three count data frames into a single data frame, and assigns column names
raw_sorted_counts_df <- cbind((read.table(sorted_p1_count_path, header = FALSE, sep = "\t", stringsAsFactors = FALSE))$V1, # only for row names
                      
                    (read.table(sorted_p1_count_path, header = FALSE, sep = "\t", stringsAsFactors = FALSE))$V2, # p1
                    
                    (read.table(sorted_p1_sim1_count_path, header = FALSE, sep = "\t", stringsAsFactors = FALSE))$V2, 
                    (read.table(sorted_p1_sim2_count_path, header = FALSE, sep = "\t", stringsAsFactors = FALSE))$V2,
                    (read.table(sorted_p1_sim3_count_path, header = FALSE, sep = "\t", stringsAsFactors = FALSE))$V2,
                    
                    (read.table(sorted_wt_count_path, header = FALSE, sep = "\t", stringsAsFactors = FALSE))$V2, # wt
                    
                    (read.table(sorted_wt_sim1_count_path, header = FALSE, sep = "\t", stringsAsFactors = FALSE))$V2, 
                    (read.table(sorted_wt_sim2_count_path, header = FALSE, sep = "\t", stringsAsFactors = FALSE))$V2,
                    (read.table(sorted_wt_sim3_count_path, header = FALSE, sep = "\t", stringsAsFactors = FALSE))$V2
                    )

colnames(raw_sorted_counts_df) <- c("locus_tag", 
                          "p1",
                          "p1_sim1","p1_sim2","p1_sim3",
                          "wt",
                          "wt_sim1","wt_sim2","wt_sim3")

# Preprocessing raw counts data frame
raw_counts_df_no_annotation <- raw_sorted_counts_df %>%
  as.data.frame() %>%
  remove_rownames() %>% 
  mutate_at(vars(2:9), as.integer) %>%
  mutate(locus_tag = str_replace(locus_tag, "locus_tag-", "")) %>% 
  column_to_rownames(var="locus_tag")

# Filter out rows with zero counts across all samples
non_empty_locus_tags <- rowSums(raw_counts_df_no_annotation) >= 1
filtered_raw_counts_df_no_annotation <- raw_counts_df_no_annotation[non_empty_locus_tags,]
cat("Changes from raw counts to filtered counts: ", nrow(raw_counts_df_no_annotation), "genes to ", nrow(filtered_raw_counts_df_no_annotation), "; ",(nrow(raw_counts_df_no_annotation) - nrow(filtered_raw_counts_df_no_annotation)) , "zero-count genes filtered", "\n")
## Changes from raw counts to filtered counts:  5607 genes to  5391 ;  216 zero-count genes filtered
# Read and process annotation metadata from GFF file
annotation_metadata <- readGFF(reannotated_gff_path) %>%
  as.data.frame() %>%
  select(locus_tag, type, start, end, strand, product) %>%
  mutate(gene_length = end - start)  # Inclusive of both start and end positions

# We joined counts and annotation from gff3 file from the ASF519 reference genome
  # 5391 - 5,387 = 4 locus_tags removed (incomplete annotation locus_tags)
# Join counts with annotation metadata
annotated_counts_df <- filtered_raw_counts_df_no_annotation %>%
  rownames_to_column("locus_tag") %>%
  inner_join(annotation_metadata, by = "locus_tag")
cat("Changes from filtered counts to annotated counts: ", nrow(filtered_raw_counts_df_no_annotation), "genes to ", nrow(annotated_counts_df), "; ",nrow(filtered_raw_counts_df_no_annotation) - nrow(annotated_counts_df) , " genes filtered from incomplete annotation", "\n")
## Changes from filtered counts to annotated counts:  5391 genes to  5387 ;  4  genes filtered from incomplete annotation
# Separate annotation and count data into two data frames
annotation_df <- annotated_counts_df %>%
  select(locus_tag, start, end, gene_length, strand, product)

raw_counts_df <- annotated_counts_df %>%
  select(locus_tag, starts_with("p1"), starts_with("wt"))

raw_counts_mat <- raw_counts_df %>%
  column_to_rownames(var = "locus_tag") %>%
  as.matrix()  # Convert to matrix for downstream analysis

# Optionally, display the tables using DT for interactive exploration in R Shiny or R Markdown
# DT::datatable(annotation_df, rownames = FALSE, options = list(pageLength = 5))
# DT::datatable(raw_counts_df, rownames = FALSE, options = list(pageLength = 5))

# Output: Return a list containing the processed data frames/matrices
# list(
#   annotation_df = annotation_df,
#   raw_counts_df = raw_counts_df
# )
# Define a function to read eggNOG annotations with proper formatting and handling
read_eggnog_annotations <- function(filepath) {
  # Define column names for the eggNOG annotation file
  col_names <- c("query", "seed_ortholog", "evalue", "score", "eggNOG_OGs", 
                 "max_annot_lvl", "COG_category", "Description", "Preferred_name", 
                 "GOs", "EC", "KEGG_ko", "KEGG_Pathway", "KEGG_Module", "KEGG_Reaction", 
                 "KEGG_rclass", "BRITE", "KEGG_TC", "CAZy", "BiGG_Reaction", "PFAMs")
  
  # Define the expected column types
  col_types <- cols(
    query = col_character(),
    seed_ortholog = col_character(),
    evalue = col_double(),
    score = col_double(),
    eggNOG_OGs = col_character(),
    max_annot_lvl = col_character(),
    COG_category = col_character(),
    Description = col_character(),
    Preferred_name = col_character(),
    GOs = col_character(),
    EC = col_character(),
    KEGG_ko = col_character(),
    KEGG_Pathway = col_character(),
    KEGG_Module = col_character(),
    KEGG_Reaction = col_character(),
    KEGG_rclass = col_character(),
    BRITE = col_character(),
    KEGG_TC = col_character(),
    CAZy = col_character(),
    BiGG_Reaction = col_character(),
    PFAMs = col_character()
  )
  
  # Read the file using read_tsv and explicitly set column types
  annotations <- read_tsv(filepath, skip = 5, col_names = col_names, col_types = col_types) %>%
    filter(!str_starts(query, "##")) %>%
    mutate(across(where(is.character), ~na_if(., "-"))) %>%
    mutate(across(where(is.character), ~na_if(., "")))
  
  return(annotations)
}

# Read eggNOG annotations using the defined function
eggnog_annotations <- read_eggnog_annotations(emapper_path)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
# Count the number of NA values for each column
na_count <- eggnog_annotations %>%
  summarise(across(everything(), ~sum(is.na(.)), .names = "na_count_{.col}"))

# Subset the eggNOG annotations to relevant columns
eggnog_meta_subset <- eggnog_annotations %>%
  select(query, seed_ortholog, evalue, score, eggNOG_OGs, max_annot_lvl, COG_category, Description, Preferred_name, PFAMs) %>%
    dplyr::rename(locus_tag = query)

# Join the counts with annotation data and the eggNOG metadata
annotation_df <- annotated_counts_df %>%
  select(locus_tag, start, end, gene_length, strand, product) %>%
  left_join(eggnog_meta_subset, by = "locus_tag")

# Optionally, display the tables using DT for interactive exploration
# DT::datatable(eggnog_annotations, options = list(pageLength = 5), rownames = FALSE)
# DT::datatable(eggnog_meta_subset, options = list(pageLength = 5), rownames = FALSE)
# DT::datatable(full_metadata, options = list(pageLength = 5), rownames = FALSE)

# Optionally, display the tables using DT for interactive exploration in R Shiny or R Markdown
# DT::datatable(annotation_df, rownames = FALSE, options = list(pageLength = 5))
# Read metadata, ensuring that the 'id' column is set as row names
tryCatch({
  meta_df <- read.csv(metadata_path, row.names = 1)
}, error = function(e) {
  stop("Error reading metadata file: ", e$message)
})

# Remove unwanted rows by index - Rows 5, 6, 7, and 8 are removed
# It is assumed that there is a clear reason these specific rows are removed
meta_df <- meta_df[-c(5, 6, 7, 8), ]

# Convert character variables to factors for all columns where it is applicable
meta_df[] <- lapply(meta_df, function(col) if(is.character(col)) as.factor(col) else col)

# Assuming 'raw_counts_mat' is already defined in your environment
# Check that sample names match between metadata and counts data
# It's important for downstream analysis that these names match exactly
if (!all(colnames(raw_counts_mat) %in% rownames(meta_df))) {
  stop("Not all sample names from counts data are present in metadata.")
}

if (!all(colnames(raw_counts_mat) == rownames(meta_df))) {
  warning("Sample names in counts data and metadata do not match in order.")
  # Optional: Reorder the rows of meta_df to match the order of raw_counts_mat columns
  meta_df <- meta_df[match(colnames(raw_counts_mat), rownames(meta_df)), ]
}

# Return or proceed with the cleaned and checked metadata frame
meta_df
##         simulation group
## p1             exp    p1
## p1_sim1        sim    p1
## p1_sim2        sim    p1
## p1_sim3        sim    p1
## wt             exp    wt
## wt_sim1        sim    wt
## wt_sim2        sim    wt
## wt_sim3        sim    wt
# Define the groups for the analysis
group <- as.factor(c("exp",rep("sim", 3),"exp", rep("sim", 3)))

# Calculate Reads Per Kilobase (RPK)
rpk <- ( (raw_counts_mat * 10^3) / annotation_df$gene_length)

# Create a DGEList object
dge_obj_rpk <- edgeR::DGEList(counts = rpk, 
                            lib.size = colSums(rpk), 
                            samples = dplyr::select(meta_df,-group),
                            group = group,
                            locus_tags = annotation_df, 
                            remove.zeros = TRUE)

# Keep a copy of the original DGEList object before filtering
dge_obj_rpk_unfilt <- dge_obj_rpk

# Pre-filtering: Keep genes with at least 100 counts (that is 100 RPKM) in at least 4 samples
# This threshold is chosen to ensure a reasonable abundance level across samples
keep <- rowSums(cpm(dge_obj_rpk) > 100) >= 4
dge_obj_rpk <- dge_obj_rpk[keep, ]

# Report the number of genes retained after pre-filtering
cat("Number of locus_tags removed: ", nrow(rpk) - sum(rowSums(cpm(dge_obj_rpk) > 100) >= 4), "\n")
## Number of locus_tags removed:  950
cat("Number of genes retained after pre-filtering: ", sum(keep), "\n")
## Number of genes retained after pre-filtering:  4437
# Normalize using TMM
dge_obj_tmm_rpk <- dge_obj_rpk %>% 
  calcNormFactors(method="TMM", doWeighting=TRUE)

dge_obj_tmm_rpk_unfilt <- dge_obj_rpk_unfilt %>% 
  calcNormFactors(method="TMM", doWeighting=TRUE)

# Convert normalized counts to GeTMM (for DESeq2)
getmm <- cpm(dge_obj_tmm_rpk, normalized.lib.sizes = TRUE)
getmm_unfilt <- cpm(dge_obj_tmm_rpk_unfilt, normalized.lib.sizes = TRUE)

# Convert normalized counts to log2 CPM values (for Limma)
log2getmm <- cpm(dge_obj_tmm_rpk, log=TRUE, prior.count=1, normalized.lib.sizes = TRUE)
log2getmm_unfilt <- cpm(dge_obj_tmm_rpk_unfilt, log=TRUE, prior.count=1, normalized.lib.sizes = TRUE)

# Create data frames from the normalized log2 GeTMM and GeTMM matrices
log2getmm_df <- as.data.frame(log2getmm) %>%
  rownames_to_column(var = "Gene")

# Convert the data frame to a long format for visualization or further analysis]
log2getmm_long_df <- log2getmm_df %>% 
                 gather(Sample, Value, -Gene) %>%
                 mutate(Type = ifelse(Sample %in% c("p1", "p2", "wt"), "Empirical", "Simulated"))
# Define a color scheme for the samples
color_scheme <- c(
  "#8B005B", # "P1"
  "#DD0089", # "P1_sim1"
  "#FF20AD", # "P1_sim2"
  "#FF80E0", # "P1_sim3"
  
  "#007D75", # "WT"
  "#00BBBB", # "WT_sim1"
  "#00E4E4", # "WT_sim2"
  "#66FFFF"  # "WT_sim3"
)

# Function to determine line types based on the sample name
determine_linetype <- function(Sample) {
  if (grepl("_sim", Sample)) {
    return("dashed")
  } else {
    return("solid")
  }
}
# Apply the linetype function to the unique samples in the dataframe
linetypes <- sapply(unique(log2getmm_long_df$Sample), determine_linetype)

# Plot settings
box_title_size <- 6

# Create a violin plot with corresponding boxplot and jitter
violin_plot <- ggplot(log2getmm_long_df, aes(x = Sample, y = Value, color = Sample)) + 
  geom_violin(width = 1.5) +
  geom_boxplot(outlier.shape = NA) +  # Hide outliers
  geom_jitter(position = position_jitter(width = 0.15), size = 0.05, alpha = 0.2) +
  scale_color_manual(values = color_scheme) +
  theme_minimal() + 
  labs(title = "Violin: log2(GeTMM) normalized", y = "log2(GeTMM) genes", tag = "B-1") +
  theme(legend.title = element_blank(), plot.title = element_text(size = box_title_size),
        axis.ticks.x = element_blank(), axis.text.x = element_blank()) + 
  egg::theme_article(base_size = box_title_size)

# Create a density plot
density_plot <- ggplot(log2getmm_long_df, aes(x = Value, color = Sample, linetype = Sample)) + 
  geom_density(alpha = 0.2, linewidth = 0.15) + 
  coord_cartesian(ylim = c(0, 1)) +
  scale_color_manual(values = color_scheme) +
  scale_linetype_manual(values = linetypes) +
  theme_minimal() + 
  labs(title = "Density: log2(GeTMM) normalized", x = "log2(GeTMM)", y = "Density", tag = "B-2") +
  theme(legend.title = element_blank(), plot.title = element_text(size = box_title_size)) + 
  egg::theme_article(base_size = box_title_size)

# Create an ECDF plot
ecdf_plot <- ggplot(log2getmm_long_df, aes(x = Value, color = Sample, linetype = Sample)) + 
  stat_ecdf(linewidth = 0.2, alpha = 0.5) +
  scale_color_manual(values = color_scheme) +
  scale_linetype_manual(values = linetypes) +
  theme_minimal() + 
  labs(title = "ECDF: log2(GeTMM) normalized", x = "log2(GeTMM)", y = "Cumulative Proportion", tag = "B-3") +
  theme(legend.title = element_blank(), plot.title = element_text(size = box_title_size)) + 
  egg::theme_article(base_size = box_title_size)

# Arrange the plots into a single object
normalization <- arrangeGrob(violin_plot, density_plot, ecdf_plot, ncol = 3, nrow = 2)
## Warning: `position_dodge()` requires non-overlapping x intervals
# Draw the plots on the current graphic device
grid.draw(normalization)

# Save the plots to files
ggsave("./Output_figures_n_plots/fig1.1_normalization.svg", plot=normalization, device=CairoSVG, width = 9, height = 4.5) # height 5.5 for 3*4 plot
ggsave("./Output_figures_n_plots/fig1.1_normalization.pdf", plot=normalization, device=CairoPDF, width = 9, height = 4.5)
ggsave("./Output_figures_n_plots/fig1.1_normalization.png", plot=normalization, width = 9, height = 4.5, dpi=400)
# Define empirical and simulated column names
empirical_cols <- c("p1", "wt")
simulated_cols <- c("p1_sim1", "p1_sim2", "p1_sim3", "wt_sim1", "wt_sim2", "wt_sim3")

# Define a function to perform Levene's Test on a given row
calculate_levene_pval <- function(row) {
  group_factor <- factor(rep(c("empirical", "simulated"), 
                             times = c(length(empirical_cols), length(simulated_cols))))
  levenes_res <- leveneTest(y = c(row[empirical_cols], row[simulated_cols]), group = group_factor)
  levenes_res[1, "Pr(>F)"]
}

# Define a function to perform Bartlett's Test on a given row
calculate_bartlett_pval <- function(row) {
  bartlett_res <- bartlett.test(list(row[empirical_cols], row[simulated_cols]))
  bartlett_res$p.value
}

# Apply the defined functions to calculate p-values
levene_pvals <- apply(log2getmm, 1, calculate_levene_pval)
bartlett_pvals <- apply(log2getmm, 1, calculate_bartlett_pval)

# Combine the p-values into a data frame for visualization
heteroscedasticity_pvals_df <- data.frame(
  method = rep(c("Levene", "Bartlett"), each = length(levene_pvals)),
  p_value = c(levene_pvals, bartlett_pvals)
)

# Plot histograms to compare p-values from Levene's and Bartlett's tests
heteroscedasticity_test_plot <- ggplot(heteroscedasticity_pvals_df, aes(x = p_value, fill = method)) +
  geom_histogram(binwidth = 0.02, position = "identity", alpha = 0.5) +
  scale_fill_manual(values = c("Levene" = "#0072B2", "Bartlett" = "#D55E00")) +
  labs(
    title = "P-value Histograms from Levene's and Bartlett's Tests",
    x = "P-value",
    y = "Frequency"
  ) +
  theme_minimal() +
  egg::theme_article(base_size = 10)

grid.draw(heteroscedasticity_test_plot)

ggsave("./Output_figures_n_plots/fig1.2_log2getmm_heteroscedasticity.pdf", plot=heteroscedasticity_test_plot, device=CairoPDF, width = 6, height = 4)
ggsave("./Output_figures_n_plots/fig1.2_log2getmm_heteroscedasticity.svg", plot=heteroscedasticity_test_plot, device=CairoSVG, width = 6, height = 4)
ggsave("./Output_figures_n_plots/fig1.2_log2getmm_heteroscedasticity.png", plot=heteroscedasticity_test_plot, width = 6, height = 4)
# Create a list of vector pairs

log2getmm_vector_pairs <- list(
  list(log2getmm_df$p1, log2getmm_df$p1_sim1),  list(log2getmm_df$p1, log2getmm_df$p1_sim2),  list(log2getmm_df$p1, log2getmm_df$p1_sim3),
  list(log2getmm_df$wt, log2getmm_df$wt_sim1),  list(log2getmm_df$wt, log2getmm_df$wt_sim2),  list(log2getmm_df$wt, log2getmm_df$wt_sim3))

pair_names <- c("p1 vs p1_sim1", "p1 vs p1_sim2", "p1 vs p1_sim3",
                "wt vs wt_sim1", "wt vs wt_sim2", "wt vs wt_sim3")

# Mann-Whitney U test (Wilcoxon signed-rank test) # independent sample assumption
run_mannwhitney_test <- function(pair, pair_name) {
  test_result <- stats::wilcox.test(pair[[1]], pair[[2]], paired = FALSE)
  tibble::tibble(Samples = pair_name, Mann_Whitney_U_Stat = test_result$statistic, p_value = test_result$p.value)
}

# Kruskal-Wallis test (this is for more than two groups)
run_kruskal_test <- function(pair, pair_name) {
  test_result <- stats::kruskal.test(list(pair[[1]], pair[[2]])) # Extend the list for more groups
  tibble::tibble(Samples = pair_name, Kruskal_Wallis_Stat = test_result$statistic, p_value = test_result$p.value)
}

# Kolmogorov–Smirnov test
run_ks_test <- function(pair, pair_name) {
  set.seed(519)
  test_result <- stats::ks.test(pair[[1]], pair[[2]], alternative = "two.sided", simulate.p.value = TRUE, exact = TRUE , B=as.integer(length(pair[[1]])))
  tibble::tibble(Samples = pair_name, Kolmogorov_Smirnov_D_Stat = test_result$statistic, p_value = test_result$p.value)
}
# Cramér-von Mises test
run_cvm_test <- function(pair, pair_name) {
  set.seed(519)
  test_result <- twosamples::cvm_test(pair[[1]], pair[[2]], nboots = length(pair[[1]]))
  tibble::tibble(Samples = pair_name, Cramér_von_Mises_Stat = test_result[[1]], p_value = test_result[[2]])
}

# Apply the tests
log2getmm_mannwhitney_result <- map2_dfr(log2getmm_vector_pairs, pair_names, run_mannwhitney_test) %>% mutate(p_value_rounded = round(p_value, 4))
log2getmm_mannwhitney_result %>% as.matrix()
##      Samples         Mann_Whitney_U_Stat p_value      p_value_rounded
## [1,] "p1 vs p1_sim1" "9592384"           "0.03743757" "0.0374"       
## [2,] "p1 vs p1_sim2" "9585526"           "0.03253340" "0.0325"       
## [3,] "p1 vs p1_sim3" "9615091"           "0.05838732" "0.0584"       
## [4,] "wt vs wt_sim1" "9799139"           "0.71324335" "0.7132"       
## [5,] "wt vs wt_sim2" "9828775"           "0.90297900" "0.9030"       
## [6,] "wt vs wt_sim3" "9830541"           "0.91457994" "0.9146"
log2getmm_kruskal_result <- map2_dfr(log2getmm_vector_pairs, pair_names, run_kruskal_test) %>% mutate(p_value_rounded = round(p_value, 4))
log2getmm_kruskal_result %>%  as.matrix()
##      Samples         Kruskal_Wallis_Stat p_value      p_value_rounded
## [1,] "p1 vs p1_sim1" "4.33041173"        "0.03743719" "0.0374"       
## [2,] "p1 vs p1_sim2" "4.57018437"        "0.03253306" "0.0325"       
## [3,] "p1 vs p1_sim3" "3.58262634"        "0.05838677" "0.0584"       
## [4,] "wt vs wt_sim1" "0.13506199"        "0.71324026" "0.7132"       
## [5,] "wt vs wt_sim2" "0.01486038"        "0.90297572" "0.9030"       
## [6,] "wt vs wt_sim3" "0.01150635"        "0.91457665" "0.9146"
log2getmm_ks_result <- map2_dfr(log2getmm_vector_pairs, pair_names, run_ks_test)%>% mutate(p_value_rounded = round(p_value, 4))
log2getmm_ks_result %>% as.matrix()
##      Samples         Kolmogorov_Smirnov_D_Stat p_value        p_value_rounded
## [1,] "p1 vs p1_sim1" "0.09713771"              "0.0002253267" "2e-04"        
## [2,] "p1 vs p1_sim2" "0.09556006"              "0.0002253267" "2e-04"        
## [3,] "p1 vs p1_sim3" "0.08789723"              "0.0002253267" "2e-04"        
## [4,] "wt vs wt_sim1" "0.11133649"              "0.0002253267" "2e-04"        
## [5,] "wt vs wt_sim2" "0.10908271"              "0.0002253267" "2e-04"        
## [6,] "wt vs wt_sim3" "0.11066036"              "0.0002253267" "2e-04"
log2getmm_cvm_result <- map2_dfr(log2getmm_vector_pairs, pair_names, run_cvm_test) %>% mutate(p_value_rounded = round(p_value, 4))
log2getmm_cvm_result %>% as.matrix()
##      Samples         Cramér_von_Mises_Stat p_value        p_value_rounded
## [1,] "p1 vs p1_sim1" "30.99537"            "0.0001126888" "1e-04"        
## [2,] "p1 vs p1_sim2" "32.20118"            "0.0001126888" "1e-04"        
## [3,] "p1 vs p1_sim3" "27.22047"            "0.0001126888" "1e-04"        
## [4,] "wt vs wt_sim1" "37.84036"            "0.0001126888" "1e-04"        
## [5,] "wt vs wt_sim2" "36.38822"            "0.0001126888" "1e-04"        
## [6,] "wt vs wt_sim3" "34.46313"            "0.0001126888" "1e-04"
# Creating the initial dataframe using mannwhitney_result and setting "Samples" as rownames
log2getmm_dist_test_res <- log2getmm_mannwhitney_result %>%
  mutate(across(everything(), as.character)) %>% 
  left_join(log2getmm_kruskal_result, by = "Samples") %>%
  left_join(log2getmm_ks_result, by = "Samples") %>% 
  left_join(log2getmm_cvm_result, by = "Samples")

# Print the resulting dataframe
write.csv(log2getmm_dist_test_res, file = "./Output_figures_n_plots/table1_log2getmm_dist_test_res.csv", row.names = TRUE)

qq_title_size <- 6.5
# Design matrix
design <- model.matrix(~ 0 + group) # Explicitly exclude intercept to get coefficients for each group
colnames(design) <- levels(group)

# Fit linear model using limma
fit <- lmFit(log2getmm, design)

# Define contrasts for differential expression
contrast.matrix <- makeContrasts(exp_vs_sim = exp - sim, levels=design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2, trend=TRUE)

# Extract results
res_limma <- topTable(fit2, coef="exp_vs_sim", n=Inf)
# Create a DESeqDataSet from the matrix of GeTMM normalized counts
colData <- data.frame(row.names=colnames(getmm), group=group)
# rounded the GeTMM normalized counts to integers since DESeq2 requires integer counts
dds_deseq <- DESeqDataSetFromMatrix(countData = round(getmm), colData = colData, design = ~ group)
## converting counts to integer mode
# Run the DESeq function
dds_deseq <- DESeq(dds_deseq,fitType='local')
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
plotDispEsts(dds_deseq, main= "dispEst: local")

# Extract results
res_deseq <- results(dds_deseq, contrast=c("group", "exp", "sim"))
res_deseq$baseMean <- log2(res_deseq$baseMean)
res_deseq <- res_deseq %>% as.data.frame
# Create a DESeqDataSet from the matrix of GeTMM normalized counts
colData_full <- data.frame(row.names=colnames(getmm_unfilt), group=group)
# rounded the GeTMM normalized counts to integers since DESeq2 requires integer counts
dds_deseq_full <- DESeqDataSetFromMatrix(countData = round(getmm_unfilt), colData = colData, design = ~ group)
## converting counts to integer mode
# Run the DESeq function
dds_deseq_full <- DESeq(dds_deseq_full,fitType='local')
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
plotDispEsts(dds_deseq_full, main= "dispEst: local, full dataset")

# Extract results
res_deseq_full <- results(dds_deseq_full, contrast=c("group", "exp", "sim"))
res_deseq_full$baseMean <- log2(res_deseq_full$baseMean)
res_deseq_full <- res_deseq_full %>% as.data.frame
# Function to Create Overlay QQ Plot
create_overlay_qq_plot <- function(data_df, empirical_col, simulated_cols, title, xlab, ylab, legend_labels) {
  # Initialize plot
  p <- ggplot() +
       geom_abline(intercept = 0, slope = 1, color = "red") +
       coord_cartesian(xlim = c(2.5, 12), ylim = c(2.5, 12)) +
       labs(title = title, x = xlab, y = ylab, color = "Comparison set") +
       theme(legend.title = element_text(size = 10))
  
  colors <- c("orange", "blue", "green")
  
  for (i in 1:length(simulated_cols)) {
    sim_col <- simulated_cols[i]
    
    quantiles <- data.frame(
      theoretical = sort(data_df[[sim_col]]),
      sample = sort(data_df[[empirical_col]]),
      label = rep(legend_labels[i], length(data_df[[empirical_col]]))
    )
    
    p <- p + geom_point(data = quantiles, aes(x = theoretical, y = sample, color = label), alpha = 0.3) 
  }
  
  p <- p + scale_color_manual(name = "Comparison set", values = setNames(colors, legend_labels)) +
    theme(legend.title=element_blank(), plot.title = element_text(size = qq_title_size)) +
    egg::theme_article(base_size = qq_title_size)
  
  return(p)
}

# Legend labels
p1_legend_labels <- c("P1 vs P1_sim1", "P1 vs P1_sim2", "P1 vs P1_sim3")
wt_legend_labels <- c("WT vs WT_sim1", "WT vs WT_sim2", "WT vs WT_sim3")

# Create Overlay QQ Plots
p1_overlay <- create_overlay_qq_plot(log2getmm_df, "p1", c("p1_sim1", "p1_sim2", "p1_sim3"), 
                                     "Overlay QQ Plot: p1 vs. Simulated", 
                                     "Theoretical Quantiles", "Empirical Quantiles of p1", p1_legend_labels)

wt_overlay <- create_overlay_qq_plot(log2getmm_df, "wt", c("wt_sim1", "wt_sim2", "wt_sim3"), 
                                     "Overlay QQ Plot: wt vs. Simulated", 
                                     "Theoretical Quantiles", "Empirical Quantiles of wt", wt_legend_labels)

# Arrange the plots in a grid
qq_plot_grid <- grid.arrange(p1_overlay, wt_overlay, ncol = 2)

ggsave("./Output_figures_n_plots/fig1.3_log2getmm_qq_plots.svg", plot=qq_plot_grid, device=CairoSVG, width = 6, height = 2)
ggsave("./Output_figures_n_plots/fig1.3_log2getmm_qq_plots.pdf", plot=qq_plot_grid, device=CairoPDF, width = 6, height = 2)
ggsave("./Output_figures_n_plots/fig1.3_log2getmm_qq_plots.png", plot=qq_plot_grid, width = 6, height = 2, dpi=300)
# Prepare annotation data from DESeq2 object
# Rename columns and create a new column 'biotype' based on pattern matching in 'sample'
annotation_data <- as.data.frame(colData(dds_deseq)) %>%
  dplyr::rename(Seq_depth_factor = sizeFactor) %>%
  rownames_to_column(var = "sample") %>%
  mutate(biotype = dplyr::if_else(grepl("p1", sample), "P1", "WT")) %>%
  mutate(biotype = as.factor(biotype))

# Define color palette for annotation
annotation_colors <- list(
  biotype = c(
    "P1" = "#FF20AD",
    "WT" = "#00E4E4"
  )
)

# Define a color range using a color ramp
color_range <- colorRampPalette(c("blue", "white", "red"))(256)

# Create a heatmap of log2-transformed GeTMM data
heatmap_getmm <- pheatmap::pheatmap(
    mat = log2getmm,
    scale = "none",
    color = color_range,
    cluster_rows = TRUE,
    cluster_cols = TRUE,
    show_rownames = FALSE,
    show_colnames = TRUE,
    use_raster = TRUE,
    main = "log2(GeTMM)",
    annotation_col = annotation_data %>% dplyr::select(biotype),
    annotation_colors = annotation_colors,
    column_split = annotation_data$biotype
)

heatmap_getmm

ggsave("./Output_figures_n_plots/fig1.4_manuscript_heatmap.pdf", plot=heatmap_getmm, device=CairoPDF, width = 4, height = 6)
# Set thresholds for adjusted p-value and log fold change
p_threshold <- 0.05
lfc_threshold <- 1

# Annotate and rename columns of limma results
res_limma_annot <- res_limma %>%
  tibble::rownames_to_column("rowname") %>%
  left_join(annotation_df, by = c("rowname" = "locus_tag")) %>%
  tibble::column_to_rownames("rowname") %>%
  dplyr::rename_with(~ case_when(
    . == "logFC" ~ "log2FoldChange",
    . == "AveExpr" ~ "log2MeanGeTMM",
    . == "t" ~ "stat",
    . == "P.Value" ~ "pvalue",
    . == "adj.P.Val" ~ "padj",
    TRUE ~ .
  ))

# Annotate and rename columns of DESeq2 results
res_deseq_annot <- res_deseq %>%
  tibble::rownames_to_column("rowname") %>%
  left_join(annotation_df, by = c("rowname" = "locus_tag")) %>%
  tibble::column_to_rownames("rowname") %>%
  dplyr::rename_with(~ case_when(
    . == "baseMean" ~ "log2MeanGeTMM",
    TRUE ~ .
  ))

# Annotate and rename columns of full DESeq2 results
res_deseq_annot_full <- res_deseq_full %>%
  tibble::rownames_to_column("rowname") %>%
  left_join(annotation_df, by = c("rowname" = "locus_tag")) %>%
  tibble::column_to_rownames("rowname") %>%
  dplyr::rename_with(~ case_when(
    . == "baseMean" ~ "log2MeanGeTMM",
    TRUE ~ .
  ))

# Optionally, display the tables using DT for interactive exploration
# DT::datatable(res_limma_annot, rownames = FALSE)
# DT::datatable(res_deseq_annot, rownames = FALSE)
# DT::datatable(res_deseq_annot_full, rownames = FALSE)

# Calculate the number of significant results based on p-value and log fold change thresholds
n_sig_limma_p <- sum(res_limma_annot$padj < p_threshold, na.rm = TRUE)
n_sig_deseq_p <- sum(res_deseq_annot$padj < p_threshold, na.rm = TRUE)
n_sig_deseq_p_full <- sum(res_deseq_annot_full$padj < p_threshold, na.rm = TRUE)

n_sig_limma_p_lfc <- sum(res_limma_annot$padj < p_threshold & abs(res_limma_annot$log2FoldChange) > lfc_threshold, na.rm = TRUE)
n_sig_deseq_p_lfc <- sum(res_deseq_annot$padj < p_threshold & abs(res_deseq_annot$log2FoldChange) > lfc_threshold, na.rm = TRUE)
n_sig_deseq_p_lfc_full <- sum(res_deseq_annot_full$padj < p_threshold & abs(res_deseq_annot_full$log2FoldChange) > lfc_threshold, na.rm = TRUE)
# Plotting
ma_volc_title_size <- 7  # Set the title size

plot_ma_volc <- grid.arrange(

ggplot(res_deseq_annot, aes(x = log2MeanGeTMM, y = log2FoldChange)) +
  geom_point(aes(color = padj < 0.05), alpha = 0.5, size = 1) +
  scale_color_manual(values = c("TRUE" = "salmon", "FALSE" = "lightgrey"), name = "Threshold") +
  theme_minimal() +
  labs(title = "MA Plot: DESeq2",
       y = bquote(~Log[2]~ 'fold GeTMM change'),
       x = bquote(~Log[2]~ 'Mean(GeTMM)'),
       color = "Threshold", tag = "A") +
  geom_hline(yintercept = 0, color = "blue") +
  scale_x_continuous(trans = 'log2') +
  coord_cartesian(ylim = c(-3, 3)) +
  theme(legend.position = "right", 
        legend.title = element_blank(),
        plot.title = element_text(size = ma_volc_title_size)) + 
  egg::theme_article(base_size = ma_volc_title_size) + 
  annotate("text", x = Inf, y = -2.8, label = paste("Sig. genes:", n_sig_deseq_p), hjust = "right",
           size = ma_volc_title_size/2.5),

EnhancedVolcano(res_deseq_annot,
    lab = res_deseq_annot$Preferred_name,
    x = 'log2FoldChange',
    y = 'padj',
    pCutoff = 0.05,
    FCcutoff = 1,
    pointSize = 1,
    labSize = 2,
    colAlpha = 0.5,
    legendPosition = 'right',
    legendLabSize = 12,
    legendIconSize = 4.0,
    drawConnectors = TRUE,
    widthConnectors = 0.2,
    max.overlaps = Inf) + 
    theme_minimal() +
  coord_cartesian(xlim = c(-4, 4)) +
  labs(title = "Volcano: DESeq2",
       x = bquote(~Log[2]~ 'fold GeTMM change'),
       y = bquote(~Log[10]~ '(Adjusted p-value)'), 
       color = "Threshold", tag = " ") +
  theme(legend.title = element_blank(), plot.title = element_text(size = ma_volc_title_size)) + 
  egg::theme_article(base_size = ma_volc_title_size) + 
  annotate("text", x = Inf, y = Inf, label = paste("Sig. genes:", n_sig_deseq_p_lfc), hjust = 1.05, vjust = 1.2,
           size = ma_volc_title_size/3),

ggplot(res_limma_annot, aes(x = log2MeanGeTMM, y = log2FoldChange)) +
  geom_point(aes(color = padj < 0.05), alpha = 0.5, size = 1) +
  scale_color_manual(values = c("TRUE" = "salmon", "FALSE" = "lightgrey"), name = "Threshold") +
  theme_minimal() +
  labs(title = "MA Plot: limma-trend",
       y = bquote(~Log[2]~ 'fold GeTMM change'),
       x = bquote(~Log[2]~ 'Mean(GeTMM)'),
       color = "Threshold", tag = "B") +
  geom_hline(yintercept = 0, color = "blue") +
  scale_x_continuous(trans = 'log2') +
  coord_cartesian(ylim = c(-3, 3)) +
  theme(legend.position = "right", 
        legend.title = element_blank(),
        plot.title = element_text(size = ma_volc_title_size)) + 
  egg::theme_article(base_size = ma_volc_title_size) + 
  annotate("text", x = Inf, y = -2.8, label = paste("Sig. genes:", n_sig_limma_p), hjust = "right",
           size = ma_volc_title_size/2.5),

EnhancedVolcano(res_limma_annot,
    lab = res_limma_annot$Preferred_name,
    x = 'log2FoldChange',
    y = 'padj',
    pCutoff = 0.05,
    FCcutoff = 1,
    pointSize = 1,
    labSize = 2,
    colAlpha = 0.5,
    legendPosition = 'right',
    legendLabSize = 12,
    legendIconSize = 4.0,
    drawConnectors = TRUE,
    widthConnectors = 0.2,
    max.overlaps = Inf) + 
    theme_minimal() +
  coord_cartesian(xlim = c(-4, 4)) +
  labs(title = "Volcano: limma-trend",
       x = bquote(~Log[2]~ 'fold GeTMM change'),
       y = bquote(~Log[10]~ '(Adjusted p-value)'), 
       color = "Threshold", tag = " ") +
  theme(legend.title = element_blank(), plot.title = element_text(size = ma_volc_title_size)) + 
  egg::theme_article(base_size = ma_volc_title_size) + 
  annotate("text", x = Inf, y = Inf, label = paste("Sig. genes:", n_sig_limma_p_lfc), hjust = 1.05, vjust = 1.2,
           size = ma_volc_title_size/3),

  ncol=2, nrow=2)
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.
ggsave("./Output_figures_n_plots/fig2.1_MA_Volcano.svg", plot=plot_ma_volc, device=CairoSVG, width = 8, height = 6) # height 5.5 for 3*4 plot
ggsave("./Output_figures_n_plots/fig2.1_MA_Volcano.pdf", plot=plot_ma_volc, device=CairoPDF, width = 8, height = 6)
ggsave("./Output_figures_n_plots/fig2.1_MA_Volcano.png", plot=plot_ma_volc, width = 8, height = 6, dpi=400)

grid.draw(plot_ma_volc)

# Plotting
ma_volc_title_size <- 7  # Set the title size

plot_ma_volc_full <- grid.arrange(

ggplot(res_deseq_annot_full, aes(x = log2MeanGeTMM, y = log2FoldChange)) +
  geom_point(aes(color = padj < 0.05), alpha = 0.5, size = 1) +
  scale_color_manual(values = c("TRUE" = "salmon", "FALSE" = "lightgrey"), name = "Threshold") +
  theme_minimal() +
  labs(title = "MA Plot: DESeq2, full dataset",
       y = bquote(~Log[2]~ 'fold GeTMM change'),
       x = bquote(~Log[2]~ 'Mean(GeTMM)'),
       color = "Threshold", tag = "A") +
  geom_hline(yintercept = 0, color = "blue") +
  scale_x_continuous(trans = 'log2') +
  coord_cartesian(ylim = c(-3, 3)) +
  theme(legend.position = "right", 
        legend.title = element_blank(),
        plot.title = element_text(size = ma_volc_title_size)) + 
  egg::theme_article(base_size = ma_volc_title_size) + 
  annotate("text", x = Inf, y = -2.8, label = paste("Sig. genes:", n_sig_deseq_p_full), hjust = "right",
           size = ma_volc_title_size/2.5),

EnhancedVolcano(res_deseq_annot_full,
    lab = res_deseq_annot_full$Preferred_name,
    x = 'log2FoldChange',
    y = 'padj',
    pCutoff = 0.05,
    FCcutoff = 1,
    pointSize = 1,
    labSize = 2,
    colAlpha = 0.5,
    legendPosition = 'right',
    legendLabSize = 12,
    legendIconSize = 4.0,
    drawConnectors = TRUE,
    widthConnectors = 0.2,
    max.overlaps = Inf) + 
    theme_minimal() +
  coord_cartesian(xlim = c(-4, 4)) +
  labs(title = "Volcano: DESeq2, full dataset",
       x = bquote(~Log[2]~ 'fold GeTMM change'),
       y = bquote(~Log[10]~ '(Adjusted p-value)'), 
       color = "Threshold", tag = " ") +
  theme(legend.title = element_blank(), plot.title = element_text(size = ma_volc_title_size)) + 
  egg::theme_article(base_size = ma_volc_title_size) + 
  annotate("text", x = Inf, y = Inf, label = paste("Sig. genes:", n_sig_deseq_p_lfc_full), hjust = 1.05, vjust = 1.2,
           size = ma_volc_title_size/3),

  ncol=2, nrow=1)
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 3 rows containing missing values (`geom_point()`).
ggsave("./Output_figures_n_plots/supplementary_MA_Volcano_full.svg", plot=plot_ma_volc_full, device=CairoSVG, width = 8, height = 4) # height 5.5 for 3*4 plot
ggsave("./Output_figures_n_plots/supplementary_MA_Volcano_full.pdf", plot=plot_ma_volc_full, device=CairoPDF, width = 8, height = 4)
ggsave("./Output_figures_n_plots/supplementary_MA_Volcano_full.png", plot=plot_ma_volc_full, width = 8, height = 4, dpi=400)

grid.draw(plot_ma_volc_full)

# Identify differentially abundant genes from limma and DESeq results
da_limma <- rownames(res_limma_annot[res_limma_annot$padj < p_threshold & abs(res_limma_annot$log2FoldChange) > lfc_threshold, ])
da_deseq <- rownames(res_deseq_annot[res_deseq_annot$padj < p_threshold & abs(res_deseq_annot$log2FoldChange) > lfc_threshold, ])

# Prepare data for visualization
# Create a dataframe with all unique genes
# Calculate overlaps and unique genes for each method
overlap <- length(intersect(da_limma, da_deseq))
only_limma <- length(setdiff(da_limma, da_deseq))
only_deseq <- length(setdiff(da_deseq, da_limma))
neither <- nrow(res_limma_annot) - (overlap + only_limma + only_deseq)

# Print matrix
overlap_matrix <- matrix(c(overlap, only_deseq, only_limma, neither), nrow=2)
rownames(overlap_matrix) <- c("DA in DESeq", "Not DA in DESeq")
colnames(overlap_matrix) <- c("DA in limma", "Not DA in limma")
print(overlap_matrix)
##                 DA in limma Not DA in limma
## DA in DESeq             269              25
## Not DA in DESeq          81            4062
# Data preparation
overlap_gene_df <- data.frame(
  gene = union(da_limma, da_deseq)
)

# Add columns indicating whether each gene is present in limma or DESeq results
overlap_gene_df$limma <- overlap_gene_df$gene %in% da_limma
overlap_gene_df$deseq <- overlap_gene_df$gene %in% da_deseq

# UpSet Plot using ComplexUpset with modifications
overlap_gene_df$gene <- as.character(overlap_gene_df$gene)
sets <- c("limma", "deseq")
upset_complex <- ComplexUpset::upset(overlap_gene_df, sets) +
  ggtitle("Identified DA genes") +
  labs(title = "Overlap of Differentially Abundant Genes") +
  theme_minimal() +
  theme(
    axis.title.x=element_blank(),
    axis.title.y=element_blank(),
    axis.text.x=element_blank(),
    axis.text.y=element_blank(),
    axis.ticks=element_blank()
  ) +
  egg::theme_article()

# Save the ComplexUpset Plot as PNG and PDF
ggsave(filename = "./Output_figures_n_plots/fig2.2_overlap_plot_upset_complex.png", plot = upset_complex, width = 6, height = 4, dpi = 300)
ggsave(filename = "./Output_figures_n_plots/fig2.2_overlap_plot_upset_complex.pdf", plot = upset_complex, width = 6, height = 4)
# adding annotation to counts
log2getmm_df <- log2getmm %>% as.data.frame()

log2getmm_deseq <- log2getmm_df %>% 
  tibble::rownames_to_column(var = "rowname_col") %>%
  inner_join(res_deseq_annot %>% rownames_to_column(var = "rowname_col"), by = "rowname_col") %>% 
  tibble::column_to_rownames("rowname_col")

log2getmm_deseq_full <- log2getmm_df %>% 
  tibble::rownames_to_column(var = "rowname_col") %>%
  inner_join(res_deseq_annot_full %>% rownames_to_column(var = "rowname_col"), by = "rowname_col") %>% 
  tibble::column_to_rownames("rowname_col")

log2getmm_limma <- log2getmm_df %>% 
  tibble::rownames_to_column(var = "rowname_col") %>%
  inner_join(res_limma_annot %>% rownames_to_column(var = "rowname_col"), by = "rowname_col") %>% 
  tibble::column_to_rownames("rowname_col")

# Optionally, display the tables using DT for interactive exploration
# log2getmm_deseq %>% DT::datatable(rownames = F)
# log2getmm_deseq_full %>% DT::datatable(rownames = F)
# log2getmm_limma %>% DT::datatable(rownames = F)
# Adjusted p-value threshold
p_threshold <- 0.05
lfc_threshold <- 1

# both lfc and p value
sig_deseq <- log2getmm_deseq %>% filter(padj < p_threshold) 
sig_deseq_full <- log2getmm_deseq_full %>% filter(padj < p_threshold) 
sig_limma <- log2getmm_limma %>% filter(padj < p_threshold) 

# both lfc and p value
DA_deseq <- log2getmm_deseq %>% filter(padj < p_threshold) %>% filter(abs(log2FoldChange) > lfc_threshold)
DA_deseq_full <- log2getmm_deseq_full %>% filter(padj < p_threshold) %>% filter(abs(log2FoldChange) > lfc_threshold)
DA_limma <- log2getmm_limma %>% filter(padj < p_threshold) %>% filter(abs(log2FoldChange) > lfc_threshold)

# Filtering for COG is not NA
cog_sig_deseq <-  sig_deseq[!is.na(sig_deseq$COG_category), ]
cog_sig_deseq_full <-  sig_deseq_full[!is.na(sig_deseq_full$COG_category), ]
cog_sig_limma <- sig_limma[!is.na(sig_limma$COG_category), ]

# Filtering for COG is not NA
cog_DA_deseq <-  DA_deseq[!is.na(DA_deseq$COG_category), ]
cog_DA_deseq_full <-  DA_deseq_full[!is.na(DA_deseq_full$COG_category), ]
cog_DA_limma <- DA_limma[!is.na(DA_limma$COG_category), ]
# Identify overlapping genes from overlap_gene_df
overlap_genes <- overlap_gene_df %>% 
  filter(limma == TRUE & deseq == TRUE) %>% 
  pull(gene)

# Create a subset of cog_DA_deseq with only the overlapping genes
sig_overlap <- DA_deseq %>% 
  filter(row.names(.) %in% overlap_genes)

cog_sig_overlap <-  sig_overlap[!is.na(sig_overlap$COG_category), ]
cog_descriptions <- data.frame(
  COG_category = c("J", "A", "K", "L", "B", "D", "Y", "V", "T", 
                   "M", "N", "Z", "W", "U", "O", "X", "C", "G", 
                   "E", "F", "H", "I", "P", "Q", "R", "S"),
  COG_functional_family = c(
  "J - TRANSLATION, RIBOSOMAL STRUCTURE AND BIOGENESIS",
  "A - RNA PROCESSING AND MODIFICATION",
  "K - TRANSCRIPTION",
  "L - REPLICATION, RECOMBINATION AND REPAIR",
  "B - CHROMATIN STRUCTURE AND DYNAMICS",
  "D - CELL CYCLE CONTROL, CELL DIVISION, CHROMOSOME PARTITIONING",
  "Y - NUCLEAR STRUCTURE",
  "V - DEFENSE MECHANISMS",
  "T - SIGNAL TRANSDUCTION MECHANISMS",
  "M - CELL WALL/MEMBRANE/ENVELOPE BIOGENESIS",
  "N - CELL MOTILITY",
  "Z - CYTOSKELETON",
  "W - EXTRACELLULAR STRUCTURES",
  "U - INTRACELLULAR TRAFFICKING, SECRETION, AND VESICULAR TRANSPORT",
  "O - POSTTRANSLATIONAL MODIFICATION, PROTEIN TURNOVER, CHAPERONES",
  "X - MOBILOME: PROPHAGES, TRANSPOSONS",
  "C - ENERGY PRODUCTION AND CONVERSION",
  "G - CARBOHYDRATE TRANSPORT AND METABOLISM",
  "E - AMINO ACID TRANSPORT AND METABOLISM",
  "F - NUCLEOTIDE TRANSPORT AND METABOLISM",
  "H - COENZYME TRANSPORT AND METABOLISM",
  "I - LIPID TRANSPORT AND METABOLISM",
  "P - INORGANIC ION TRANSPORT AND METABOLISM",
  "Q - SECONDARY METABOLITES BIOSYNTHESIS, TRANSPORT AND CATABOLISM",
  "R - GENERAL FUNCTION PREDICTION ONLY",
  "S - FUNCTION UNKNOWN")
)

cog_colors <- c(
  "M" = "#890000",
  "K" = "#CC79A7",
  "L" = "#673AB7",
  "H" = "#000079",
  "E" = "#008080",
  "G" = "#006000",
  "C" = "#76FF03", 
  "T" = "#FFEB3B",
  "P" = "#FF7000",
  "U" = "#F15C80",
  "J" = "#9C27B0", 
  "I" = "#03A9F4", 
  "V" = "#657D9B",
  "O" = "#C5FA30",
  "F" = "#FFB000",
  "D" = "#D8BFD8", 
  "Q" = "#795548",
  "S" = "#9E9E9E")
# Function to process COG category data
process_COG_category <- function(dataframe, cog_descriptions) {
  # Filter out NAs and categorize genes based on log2FoldChange
  filtered_data <- dataframe %>% 
    filter(!is.na(COG_category)) %>%
    mutate(representation = ifelse(log2FoldChange > 0, "Overrepresented", "Underrepresented"))
  
  # Calculate and adjust counts for each COG_category and representation
  count_data <- filtered_data %>%
    group_by(COG_category, representation) %>%
    summarise(count = n(), .groups = "drop") %>%
    mutate(count = ifelse(representation == "Underrepresented", -count, count))
  
  # Expand multi-letter COG_category rows and calculate adjusted counts
  expanded_data <- count_data %>%
    rowwise() %>%
    mutate(COG_category = strsplit(as.character(COG_category), ""), 
           count = count / length(COG_category)) %>%
    unnest(cols = c(COG_category)) %>%
    group_by(COG_category, representation) %>%
    summarise(count = sum(count), .groups = "drop") %>%
    arrange(COG_category) %>%
    ungroup()
    
  # Calculate absolute sum for each COG_category and join descriptions
  ordering <- expanded_data %>%
    group_by(COG_category) %>%
    summarise(abs_sum = sum(abs(count)), .groups = "drop") %>%
    arrange(desc(abs_sum)) %>%
    pull(COG_category)

  final_data <- expanded_data %>%
    left_join(cog_descriptions, by = "COG_category")
  
  # Ensure "S" is the lowest level
  ordered_levels <- c("S", setdiff(ordering, "S"))
  
  # Reverse the order for the legend
  ordered_reverse <- rev(ordered_levels)
  
  # Convert COG_category to factor and set the levels
  final_data$COG_category <- factor(final_data$COG_category, levels = ordered_levels)
  final_data$COG_category_rev <- factor(final_data$COG_category, levels = ordered_reverse)
  
  final_data <- final_data %>% mutate(COG_category = factor(COG_category, levels = ordering))

  return(final_data)
}

# Define the plotting function
plot_cog <- function(data, total, title_tag, cog_colors, cog_descriptions, cog_title_size) {
  ggplot(data, aes(y = COG_category, x = count, fill = COG_category)) +
    geom_bar(stat = "identity", width = 0.7, color = "black") +
    geom_vline(xintercept = 0, color = "black", size = 1) + 
    geom_text(aes(label = sprintf("%.2f%%", 100 * abs(count) / total), hjust = ifelse(count < 0, 1.2, -0.2)), size = 3, position = position_dodge(0.76)) +
    labs(title = paste("Functional Profile based on COG Categories:", title_tag), y = "Reversed COG Category", x = "Number of Genes") +
    scale_fill_manual(values = cog_colors, name = "COG Categories", labels = setNames(cog_descriptions$COG_functional_family, cog_descriptions$COG_category)) +
    scale_x_continuous(limits = c(-20, 70)) +
    theme_minimal() +
    theme(legend.position = "right", legend.key.size = unit(0.2, "cm")) +
    egg::theme_article(base_size = cog_title_size) +
    annotate("text", x = 70, y = length(unique(data$COG_category)), label = paste("Genes with COG annotation:", round(total)), hjust = "right", size = 2.5)
}

# Define reverse plot function for COG categories legend extraction
plot_cog_reverse <- function(data, total, title_tag, cog_colors, cog_descriptions, cog_title_size) {
  ggplot(data, aes(y = COG_category_rev, x = count, fill = COG_category_rev)) +
    geom_bar(stat = "identity", width = 0.7, color = "black") +
    geom_vline(xintercept = 0, color = "black", size = 1) + 
    geom_text(aes(label = sprintf("%.2f%%", 100 * abs(count) / total), hjust = ifelse(count < 0, 1.2, -0.2)), size = 3, position = position_dodge(0.76)) +
    labs(title = paste("Functional Profile based on COG Categories:", title_tag), y = "Reversed COG Category", x = "Number of Genes") +
    scale_fill_manual(values = cog_colors, name = "COG Categories", labels = setNames(cog_descriptions$COG_functional_family, cog_descriptions$COG_category)) +
    scale_x_continuous(limits = c(-20, 70)) +
    theme_minimal() +
    theme(legend.position = "right", legend.key.size = unit(0.2, "cm")) +
    egg::theme_article(base_size = cog_title_size) +
    annotate("text", x = 70, y = length(unique(data$COG_category_rev)), label = paste("Genes with COG annotation:", round(total)), hjust = "right", size = 2.5)
}

# Process COG category data and calculate totals
cog_deseq <- process_COG_category(DA_deseq, cog_descriptions) 
cog_limma <- process_COG_category(DA_limma, cog_descriptions)
cog_overlap <- process_COG_category(sig_overlap, cog_descriptions)

total_deseq <- sum(abs(cog_deseq$count))
total_limma <- sum(abs(cog_limma$count))
total_overlap <- sum(abs(cog_overlap$count))

# COG title font size
cog_title_size = 9

# Create and arrange the plots
plot_deseq <- plot_cog(cog_deseq, total_deseq, "DESeq2", cog_colors, cog_descriptions, cog_title_size)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
plot_limma <- plot_cog(cog_limma, total_limma, "limma", cog_colors, cog_descriptions, cog_title_size)
plot_overlap <- plot_cog(cog_overlap, total_overlap, "Consensus DA genes", cog_colors, cog_descriptions, cog_title_size)

plot_deseq_rev <- plot_cog_reverse(cog_deseq, total_deseq, "DESeq2", cog_colors, cog_descriptions, cog_title_size)
plot_limma_rev <- plot_cog_reverse(cog_limma, total_limma, "limma", cog_colors, cog_descriptions, cog_title_size)
plot_overlap_rev <- plot_cog_reverse(cog_overlap, total_overlap, "Consensus DA genes", cog_colors, cog_descriptions, cog_title_size)

plot_cog_deseq <- grid.arrange(plot_deseq, plot_deseq_rev, ncol = 2)
## Warning: Removed 1 rows containing missing values (`position_stack()`).
## Warning: Removed 1 rows containing missing values (`geom_text()`).
## Warning: Removed 1 rows containing missing values (`position_stack()`).
## Warning: Removed 1 rows containing missing values (`geom_text()`).

plot_cog_limma <- grid.arrange(plot_limma, plot_limma_rev, ncol = 2)
## Warning: Removed 1 rows containing missing values (`position_stack()`).
## Removed 1 rows containing missing values (`geom_text()`).
## Warning: Removed 1 rows containing missing values (`position_stack()`).
## Warning: Removed 1 rows containing missing values (`geom_text()`).

plot_cog_overlap <- grid.arrange(plot_overlap, plot_overlap_rev, ncol = 2)

# Save the plots
ggsave("./Output_figures_n_plots/supplementary_COG_deseq.svg", plot=plot_cog_deseq, device=CairoSVG, width = 20, height = 4)
ggsave("./Output_figures_n_plots/supplementary_COG_deseq.pdf", plot=plot_cog_deseq, device=CairoPDF, width = 20, height = 4)
ggsave("./Output_figures_n_plots/supplementary_COG_deseq.png", plot=plot_cog_deseq, width = 20, height = 4, dpi=400)

ggsave("./Output_figures_n_plots/supplementary_COG_limma.svg", plot=plot_cog_limma, device=CairoSVG, width = 20, height = 4)
ggsave("./Output_figures_n_plots/supplementary_COG_limma.pdf", plot=plot_cog_limma, device=CairoPDF, width = 20, height = 4)
ggsave("./Output_figures_n_plots/supplementary_COG_limma.png", plot=plot_cog_limma, width = 20, height = 4, dpi=400)

ggsave("./Output_figures_n_plots/fig2.3_COG_overlap.svg", plot=plot_cog_overlap, device=CairoSVG, width = 20, height = 4)
ggsave("./Output_figures_n_plots/fig2.3_COG_overlap.pdf", plot=plot_cog_overlap, device=CairoPDF, width = 20, height = 4)
ggsave("./Output_figures_n_plots/fig2.3_COG_overlap.png", plot=plot_cog_overlap, width = 20, height = 4, dpi=400)
# For Directional hypergeometric test
relative_count_COG_function <- function(df, remove_S) {
  if(remove_S == TRUE){
    relative_count_df <-   
      df[!is.na(df$COG_category), ] %>% 
      filter(COG_category != "S")
  }else{
    relative_count_df <-   
      df[!is.na(df$COG_category), ]
  }
  relative_count_df <- relative_count_df %>% 
    mutate(representation = ifelse(log2FoldChange > 0, "Overrepresented", "Underrepresented")) %>% 
    select(log2FoldChange, COG_category, representation) %>% 
    group_by(COG_category, representation) %>%
    summarise(count = n(), .groups = "drop") %>% 
    mutate(count = ifelse(representation == "Underrepresented", -count, count)) %>% 
    rowwise() %>%
    mutate(COG_category = strsplit(as.character(COG_category), "")) %>%
    mutate(count = count / length(COG_category)) %>% 
    unnest(cols = c(COG_category)) %>% 
    group_by(COG_category, representation) %>% 
    summarise(count = round(abs(sum(count))), .groups = "drop") %>% 
    arrange(COG_category) %>% ungroup() %>% 
    mutate(COG_category = str_c(COG_category, representation, sep = "_")) %>%
    select(-representation)
  return(relative_count_df)
}

hypergeometric_directional <- function(background_df, sig_df, remove_S){
counted_background <- relative_count_COG_function(background_df, remove_S) %>% 
    dplyr::rename(CogNotsig = count) %>% 
    mutate(NotcogNotsig =(sum(CogNotsig) - CogNotsig))
  
  counted_da <- relative_count_COG_function(sig_df, remove_S) %>% 
    dplyr::rename(CogSig = count) %>% 
    mutate(NotcogSig = (sum(CogSig) - CogSig))
  
  full_ora_matrix_df <- left_join(counted_background, counted_da, by="COG_category") %>%  
    mutate_all(~replace(., is.na(.), 0))
  
  # Separate over-representation and under-representation
  up_ora_df <- full_ora_matrix_df %>%
    filter(str_detect(COG_category, "Overrepresented"))
  
  down_ora_df <- full_ora_matrix_df %>%
    filter(str_detect(COG_category, "Underrepresented"))

  # Calculating Fisher's exact test for each category and adjusting p-values
  p_values_up <- apply(up_ora_df, 1, function(row) {
    # Extracting the necessary values and ensuring they are numeric
    CogSig <- as.numeric(row['CogSig'])
    CogNotsig <- as.numeric(row['CogNotsig'])
    NotcogSig <- as.numeric(row['NotcogSig'])
    NotcogNotsig <- as.numeric(row['NotcogNotsig'])
  
    # Constructing the contingency table
    contingency_table <- matrix(c(CogSig, CogNotsig, NotcogSig, NotcogNotsig), nrow = 2)
  
    # Performing Fisher's exact test
      fisher.test(contingency_table, alternative = "greater")$p.value
  })
  
  # Calculating Fisher's exact test for each category and adjusting p-values
  p_values_down <- apply(down_ora_df, 1, function(row) {
    # Extracting the necessary values and ensuring they are numeric
    CogSig <- as.numeric(row['CogSig'])
    CogNotsig <- as.numeric(row['CogNotsig'])
    NotcogSig <- as.numeric(row['NotcogSig'])
    NotcogNotsig <- as.numeric(row['NotcogNotsig'])
  
    # Constructing the contingency table
    contingency_table <- matrix(c(CogSig, CogNotsig, NotcogSig, NotcogNotsig), nrow = 2)
  
    # Performing Fisher's exact test
      fisher.test(contingency_table, alternative = "less")$p.value
  })
  
  # Adding the  p-values to your dataframe
  up_ora_df$p_value <- p_values_up
  down_ora_df$p_value <- p_values_down
  
  # # Adjusting for multiple comparisons using the Benjamini-Hochberg method
  # up_ora_df$adj_p_value <- p.adjust(up_ora_df$p_value, method = "BH")
  # down_ora_df$adj_p_value <- p.adjust(down_ora_df$p_value, method = "BH")
  
  # Combining the results into a single dataframe
  final_results <- bind_rows(up_ora_df, down_ora_df)
  final_results$adj_p_value <- p.adjust(final_results$p_value, method = "BH")
  
  final_results <- final_results %>% 
    mutate(enrichment = -log(p_value, base = 10)) %>%
    arrange(p_value) %>% 
    separate(COG_category, into = c("COG_category", "representation"), sep = "_")
  
  # Returning the final results
  return(final_results)
}

# usage
enrich_res_DA <- hypergeometric_directional(background_df=res_deseq_annot_full, sig_df=DA_deseq_full, remove_S=FALSE)
enrich_res_DA_noS <- hypergeometric_directional(background_df=res_deseq_annot_full, sig_df=DA_deseq_full, remove_S=TRUE)
enrich_res_Overlap <- hypergeometric_directional(background_df=res_deseq_annot_full, sig_df=cog_sig_overlap, remove_S=FALSE)
enrich_res_Overlap_noS <- hypergeometric_directional(background_df=res_deseq_annot_full, sig_df=cog_sig_overlap, remove_S=TRUE)
cog_title_size = 9

plot_cog_pathway_enrichment <- function(cog_pval_df, order = "none", title_tag = "enrichment") {
  
  print(paste("Order argument received:", order)) # Debugging line

  # Check if order is either "none", "reverse", or other valid options
  if (!order %in% c("none", "reverse")) {
    stop("Invalid order argument. Please use 'none' or 'reverse'.")
  }
  
  cog_enrichment_df <- cog_pval_df %>%
    mutate(enrichment = if_else(representation == "Underrepresented", -enrichment, enrichment)) %>%
    arrange(desc(abs(enrichment)))
  
  # Set factor levels based on ordering parameter
  if (order == "reverse") {
    levels = unique(cog_enrichment_df$COG_category)
  } else {  # "none" or other cases
    levels = rev(unique(cog_enrichment_df$COG_category))
  }
  
  # Convert COG_category to a factor with specified levels
  cog_enrichment_df$COG_category <- factor(cog_enrichment_df$COG_category, levels = levels)
  
  # Assuming cog_colors and cog_descriptions are predefined and available in your environment
  if (!exists("cog_colors") || !exists("cog_descriptions")) {
    stop("cog_colors or cog_descriptions not found in the environment")
  }

  # Create the ggplot
  plot_pathway <- ggplot(cog_enrichment_df, aes(y = COG_category, x = enrichment, fill = COG_category)) +
    geom_bar(stat = "identity", width = 0.7, color = "black") +
    geom_vline(aes(xintercept = 0), color = "black", size = 0.07) +
    geom_vline(xintercept = log10(0.05), color = "darkgrey", linetype = "dashed", size = 0.07) +
    geom_vline(xintercept = -log10(0.05), color = "darkgrey", linetype = "dashed", size = 0.07) +

    labs(title = paste("COG Enrichment: ", title_tag),
         y = "COG Category",
         x = "-log10(P value)") +
    scale_fill_manual(values = cog_colors, 
                      name = "COG Functional Family",
                      labels = setNames(cog_descriptions$COG_functional_family,
                                        cog_descriptions$COG_category)) +
    theme_minimal() +
    theme(legend.position = "right",
          legend.key.size = unit(0.2, "cm")) +
    egg::theme_article(base_size = cog_title_size)
  
  return(plot_pathway)
}

plot_enrich_DA <- grid.arrange(
  plot_cog_pathway_enrichment(enrich_res_DA, order = "reverse", title_tag = "Hypergeometric P-value"), 
  plot_cog_pathway_enrichment(enrich_res_DA, order = "none", title_tag = "Hypergeometric P-value"), 
  ncol = 2
)

## [1] "Order argument received: reverse"
## [1] "Order argument received: none"
plot_enrich_DA_noS <- grid.arrange(
  plot_cog_pathway_enrichment(enrich_res_DA_noS, order = "reverse", title_tag = "Hypergeometric P-value"), 
  plot_cog_pathway_enrichment(enrich_res_DA_noS, order = "none", title_tag = "Hypergeometric P-value"), 
  ncol = 2
)

## [1] "Order argument received: reverse"
## [1] "Order argument received: none"
plot_enrich_Overlap <- grid.arrange(
  plot_cog_pathway_enrichment(enrich_res_Overlap, order = "reverse", title_tag = "Hypergeometric P-value"), 
  plot_cog_pathway_enrichment(enrich_res_Overlap, order = "none", title_tag = "Hypergeometric P-value"), 
  ncol = 2
)

## [1] "Order argument received: reverse"
## [1] "Order argument received: none"
plot_enrich_Overlap_noS <- grid.arrange(
  plot_cog_pathway_enrichment(enrich_res_Overlap_noS, order = "reverse", title_tag = "Hypergeometric P-value"), 
  plot_cog_pathway_enrichment(enrich_res_Overlap_noS, order = "none", title_tag = "Hypergeometric P-value"), 
  ncol = 2
)

## [1] "Order argument received: reverse"
## [1] "Order argument received: none"
ggsave("./Output_figures_n_plots/supplementary_enrichment_DA.svg", plot = plot_enrich_DA, device = CairoSVG, width = 20, height = 4)
ggsave("./Output_figures_n_plots/supplementary_enrichment_DA.pdf", plot = plot_enrich_DA, device = CairoPDF, width = 20, height = 4)
ggsave("./Output_figures_n_plots/supplementary_enrichment_DA.png", plot = plot_enrich_DA, width = 20, height = 4, dpi = 400)

# ggsave("./Output_figures_n_plots/supplementary_enrichment_DA_noS.svg", plot = plot_enrich_DA_noS, device = CairoSVG, width = 20, height = 4)
# ggsave("./Output_figures_n_plots/supplementary_enrichment_DA_noS.pdf", plot = plot_enrich_DA_noS, device = CairoPDF, width = 20, height = 4)
# ggsave("./Output_figures_n_plots/supplementary_enrichment_DA_noS.png", plot = plot_enrich_DA_noS, width = 20, height = 4, dpi = 400)

ggsave("./Output_figures_n_plots/fig2.4_enrichment_Overlap.svg", plot = plot_enrich_Overlap, device = CairoSVG, width = 20, height = 4)
ggsave("./Output_figures_n_plots/fig2.4_enrichment_Overlap.pdf", plot = plot_enrich_Overlap, device = CairoPDF, width = 20, height = 4)
ggsave("./Output_figures_n_plots/fig2.4_enrichment_Overlap.png", plot = plot_enrich_Overlap, width = 20, height = 4, dpi = 400)

# ggsave("./Output_figures_n_plots/fig2.4_enrichment_Overlap_noS.svg", plot = plot_enrich_Overlap_noS, device = CairoSVG, width = 20, height = 4)
# ggsave("./Output_figures_n_plots/fig2.4_enrichment_Overlap_noS.pdf", plot = plot_enrich_Overlap_noS, device = CairoPDF, width = 20, height = 4)
# ggsave("./Output_figures_n_plots/fig2.4_enrichment_Overlap_noS.png", plot = plot_enrich_Overlap_noS, width = 20, height = 4, dpi = 400)

For annotation: http://pfam-legacy.xfam.org/family/PF02518 https://www.ebi.ac.uk/interpro/entry/pfam/PF07660/

Look up each domain in a database like PFAM or InterPro to understand its function. Cross-reference that function with literature or pathway databases like KEGG or Reactome to see if there’s any mention of involvement in purinergic signaling.

https://www.researchgate.net/figure/Pfam-domain-and-KEGG-enzyme-enrichment-analyses-of-DEGs-between-R-idaeus-Var-Amira_fig3_343558433 https://www.researchgate.net/figure/Bubble-plot-showing-the-enrichment-for-GO-KEGG-pathway-and-Pfam-domain-of_fig2_335417714 https://www.researchgate.net/figure/Enriched-protein-categories-and-Pfam-domains-A-Proteins-for-which-fragments-increased_fig6_263291958

expand_PFAMs <- function(sig_DA_df) {
  # Dataframe with gene to PFAM mapping (one to many) in rows, where duplicate relative counts are counted as 1/n(PFAM) per gene
  cog_pfam_df <- sig_DA_df %>% 
    mutate(relative_count = 1) %>% # initialize relative gene count as 1 for each gene
    select(log2MeanGeTMM, stat, log2FoldChange, padj, COG_category, PFAMs, relative_count)

  # Replace any NA values in the PFAMs column with "DUF"
  cog_pfam_df$PFAMs[is.na(cog_pfam_df$PFAMs)] <- "DUF"

  # 1. Add a column named "n_PFAMs"
  cog_pfam_df$n_PFAMs <- sapply(str_count(cog_pfam_df$PFAMs, ","), `+`, 1)

  # 2. Separate the PFAMs column into multiple rows
  expanded_df <- cog_pfam_df %>%
    tidyr::separate_rows(PFAMs, sep = ",") 

  # If a row had more than one name in "PFAMs" column AND some of the names include "DUF", 
  # only duplicate rows for names without "DUF", and then names including "DUF" should be removed.
  expanded_df <- expanded_df %>%
    group_by(log2MeanGeTMM, stat, log2FoldChange, padj, COG_category, relative_count, n_PFAMs) %>%
    filter(!(n_PFAMs > 1 & str_detect(PFAMs, "DUF"))) %>%
    ungroup()
  
  ## 1. Replace NAs in PFAMs with "DUF"
  expanded_df$PFAMs[is.na(expanded_df$PFAMs)] <- "DUF"
  ## 3. Replace names in PFAMs that start with "DUF" to just "DUF"
  expanded_df$PFAMs <- ifelse(str_starts(expanded_df$PFAMs, "DUF"), "DUF", expanded_df$PFAMs)
  
  ## 2. Adjust the "relative_count" column
  expanded_df$relative_count <- expanded_df$relative_count / expanded_df$n_PFAMs
  
  
  
  ## 4. Handle COG_category with more than one character
  # Count the number of COG_Category assigned to each gene
  expanded_df$n_COGs <- sapply(str_count(expanded_df$COG_category, ""), `+`)
  
  ## 5 duplicate COGs 
  expanded_df <- expanded_df %>%
    tidyr::unnest(COG_category = strsplit(as.character(COG_category), ""))%>%
    mutate(relative_count = relative_count / n_COGs)
  
  ## 6 Remove rows with PFAMs == DUF
  #expanded_df <- expanded_df[!grepl("DUF", expanded_df$PFAMs),]

  return(expanded_df)
}

cog_pfam_df_deseq <- expand_PFAMs(cog_DA_deseq)
## Warning: `unnest()` has a new interface. See `?unnest` for details.
## ℹ Try `df %>% unnest(c(COG_category))`, with `mutate()` if needed.
cog_pfam_df_limma <- expand_PFAMs(cog_DA_limma)
## Warning: `unnest()` has a new interface. See `?unnest` for details.
## ℹ Try `df %>% unnest(c(COG_category))`, with `mutate()` if needed.
cog_pfam_df_overlap <- expand_PFAMs(cog_sig_overlap)
## Warning: `unnest()` has a new interface. See `?unnest` for details.
## ℹ Try `df %>% unnest(c(COG_category))`, with `mutate()` if needed.
unique(cog_pfam_df_overlap$PFAMs)
##   [1] "FAD_syn"         "Flavokinase"     "M20_dimer"       "Peptidase_M20"  
##   [5] "Peptidase_M28"   "Polysacc_synt"   "Polysacc_synt_C" "HEPN"           
##   [9] "Acetyltransf_3"  "DUF"             "TPR_12"          "TPR_8"          
##  [13] "FecR"            "Sigma70_r2"      "Sigma70_r4_2"    "Beta_helix"     
##  [17] "Lipase_GDSL_2"   "OMP_b-brl_2"     "Reg_prop"        "Inhibitor_I69"  
##  [21] "Peptidase_C10"   "HTH_18"          "Acyl_transf_3"   "TolB_like"      
##  [25] "Peptidase_S41"   "Peptidase_S24"   "Peptidase_S26"   "Glycos_transf_2"
##  [29] "Kdo"             "GFO_IDH_MocA"    "GFO_IDH_MocA_C"  "Cupin_2"        
##  [33] "HTH_19"          "HTH_3"           "LANC_like"       "Glyco_trans_1_4"
##  [37] "Glyco_transf_4"  "Glycos_transf_1" "Peptidase_C39"   "Thioredoxin_4"  
##  [41] "VKOR"            "CN_hydrolase"    "PseudoU_synth_2" "S4"             
##  [45] "LVIVD"           "Metallophos_2"   "HTH_17"          "FtsX"           
##  [49] "MacB_PCD"        "Arm-DNA-bind_5"  "Phage_int_SAM_5" "Phage_integrase"
##  [53] "Bro-N"           "PMT_2"           "TPR_1"           "TPR_16"         
##  [57] "TPR_2"           "Glyco_hydro_88"  "DHODB_Fe-S_bind" "FAD_binding_6"  
##  [61] "NAD_binding_1"   "CarbopepD_reg_2" "Plug"            "AraC_binding"   
##  [65] "BPL_LplA_LipB"   "Lip_prot_lig_C"  "Radical_SAM"     "Phage_int_SAM_1"
##  [69] "Phage_int_SAM_4" "Patatin"         "NACHT"           "TIR_2"          
##  [73] "AAA_10"          "HTH_26"          "Mfa2"            "PepSY_like"     
##  [77] "Methyltransf_11" "Methyltransf_23" "Aminotran_1_2"   "Aminotran_3"    
##  [81] "BATS"            "PTR2"            "CTP_transf_1"    "RsfS"           
##  [85] "Pirin"           "DXPR_C"          "DXP_redisom_C"   "DXP_reductoisom"
##  [89] "Amidohydro_1"    "adh_short"       "Steroid_dh"      "BNR_2"          
##  [93] "NifU_N"          "Bac_DNA_binding" "Epimerase_2"     "RecA"           
##  [97] "EpsG"            "Poly_export"     "SLBB"            "TonB_dep_Rec"   
## [101] "DnaB_2"          "Wzy_C"           "Glyco_trans_4_2" "Bac_transf"     
## [105] "Hexapep"         "AAA_31"          "Wzz"             "OEP"            
## [109] "APS_kinase"      "CoA_binding"     "Ligase_CoA"      "Succ_CoA_lig"   
## [113] "HATPase_c"       "LemA"            "Acetyltransf_1"  "Acetyltransf_10"
## [117] "NAD_binding_8"   "Fic"             "Mfa_like_1"      "MukB"           
## [121] "Ferritin"        "Glycos_transf_4" "MR_MLE_C"        "MR_MLE_N"       
## [125] "Peptidase_C14"   "HU-DNA_bdg"      "AsnC_trans_reg"  "HTH_24"         
## [129] "OMP_b-brl"       "OmpA"            "Trans_reg_C"     "Glyco_trans_4_4"
## [133] "FGE-sulfatase"   "PEGA"            "Trypsin_2"       "HTH_Crp_2"      
## [137] "cNMP_binding"    "RicinB_lectin_2" "Transglut_core"  "Peptidase_M23"  
## [141] "YdfA_immunity"   "ApbA"            "AA_kinase"       "PUA"            
## [145] "Hydrolase_4"     "Caps_assemb_Wzi" "HAD_2"           "Glycos_transf_N"
## [149] "MM_CoA_mutase"   "CobD_Cbib"       "CobS"            "POR_N"          
## [153] "TPP_enzyme_C"    "FumaraseC_C"     "Lyase_1"         "LytTR"          
## [157] "Response_reg"    "His_kinase"      "Rho_RNA_bind"    "Beta-lactamase" 
## [161] "Glyco_hydro_3"   "Glyco_hydro_3_C" "ExbD"            "Rotamase"       
## [165] "SurA_N_3"        "Abi"             "ABC_tran"        "Queuosine_synth"
## [169] "adh_short_C2"    "TP_methylase"    "Lactamase_B_2"   "QueC"           
## [173] "DXP_synthase_N"  "E1_dh"           "Transket_pyr"    "Transketolase_C"
## [177] "TrkH"            "Exo_endo_phos"   "PfkB"            "PAD_porph"      
## [181] "URO-D"           "B12-binding"     "B12-binding_2"   "rhaM"           
## [185] "STN"
cog_colors <- c(
  "M" = "#890000",
  "K" = "#CC79A7",
  "L" = "#673AB7",
  "H" = "#000079",
  "E" = "#008080",
  "G" = "#006000",
  "C" = "#76FF03", 
  "T" = "#FFEB3B",
  "P" = "#FF7000",
  "U" = "#F15C80",
  "J" = "#9C27B0", 
  "I" = "#03A9F4", 
  "V" = "#657D9B",
  "O" = "#C5FA30",
  "F" = "#FFB000",
  "D" = "#D8BFD8", 
  "Q" = "#795548",
  "S" = "#9E9E9E",
  "Joint COG" = "#ECECEC")

# Function to process DataFrame
process_dataframe <- function(df) {
  df$representation <- ifelse(df$log2FoldChange > 0, "Overrepresented", "Underrepresented")
  df$relative_count <- ifelse(df$representation == "Underrepresented", -df$relative_count, df$relative_count)
  return(df)
}

# Function to calculate relative counts
calc_relative_counts <- function(df) {
  return(df %>%
           group_by(COG_category, PFAMs, representation) %>%
           summarise(relative_count = sum(relative_count), .groups = "drop"))
}
# Function to generate ordering for COG and PFAM
generate_ordering <- function(relative_count_df) {
  cog_ordering <- relative_count_df %>%
    group_by(COG_category) %>%
    summarise(total_abs_sum = sum(abs(relative_count)), .groups = "drop") %>%
    arrange(-total_abs_sum) %>%  # Note the negative sign for descending order
    pull(COG_category)
  
  pfam_ordering <- relative_count_df %>%
    group_by(COG_category, PFAMs) %>%
    summarise(total_abs_sum = sum(abs(relative_count)), .groups = "drop") %>%
    arrange(COG_category, -total_abs_sum) %>%  # Note the negative sign for descending order within each COG_category
    pull(PFAMs)
  
  unique_combinations <- relative_count_df %>%
    select(COG_category, PFAMs) %>%
    distinct()
  
  unique_combinations$combined <- paste(unique_combinations$COG_category, unique_combinations$PFAMs)
  
  ordering <- unique_combinations$combined[order(match(unique_combinations$COG_category, cog_ordering), 
                                                 -match(unique_combinations$PFAMs, pfam_ordering))]
  
  reversed_ordering <- rev(ordering)
  
  return(list(cog_ordering = cog_ordering, pfam_ordering = pfam_ordering, reversed_ordering = reversed_ordering))
}

plot_relative_counts <- function(data, cog_pfam_title_size = 9) {
    total <- sum(abs(data$relative_count))
    plot <- ggplot(data, aes(y = combined_order, x = relative_count, fill = COG_category)) +
        geom_bar(stat = "identity", width = 0.7, color = "black") +
        geom_vline(aes(xintercept = 0), color = "black", size = 1) +
        geom_text(aes(label = sprintf("%.2f%%", 100 * abs(relative_count) / total), 
                      hjust = ifelse(relative_count < 0, 1.2, -0.2)), 
                  size = 3,
                  position = position_dodge(0.76)) +
        labs(title = "Functional Profile based on COG Categories and PFAMs",
             y = "COG Category and PFAM",
             x = "Relative Gene Ratio (%)") +
        scale_fill_manual(values = cog_colors, 
                          name = "COG Functional Family") +
        scale_x_continuous(limits = c(-2.3, 6.5)) +
        scale_y_discrete(labels = function(x) substr(x, 3, nchar(x))) +
        theme_minimal() +
        theme(legend.position = "right",
              legend.key.size = unit(0.2, "cm")) +
        egg::theme_article(base_size = cog_pfam_title_size) +
        annotate("text", x = max(data$relative_count) * 0.9, y = length(unique(data$combined_order)) + 1,
                 label = paste("Genes with COG and PFAM annotation:", round(total)), hjust = "right",
                 size = 2.5)
    return(plot)
}

# Main function
generate_plots <- function(input_dataframe, cog_pfam_title_size = 9, split=FALSE, split_n=NA) {
  processed_df <- process_dataframe(input_dataframe)
  relative_count_df <- calc_relative_counts(processed_df)
  ordering <- generate_ordering(relative_count_df)
  
  relative_count_df$combined_order <- factor(paste(relative_count_df$COG_category, relative_count_df$PFAMs), 
                                             levels = ordering$reversed_ordering)
  
  if(split==TRUE){
    left_plot <- relative_count_df[relative_count_df$COG_category %in% ordering$cog_ordering[1:split_n], ]
    right_plot <- relative_count_df[!relative_count_df$COG_category %in% ordering$cog_ordering[1:split_n], ]
    
    first_plot <- plot_relative_counts(left_plot)
    second_plot <- plot_relative_counts(right_plot)
    
    COG_PFAM_plot <- grid.arrange(first_plot, second_plot, ncol=2)
  }else{
    plot <- relative_count_df[relative_count_df$COG_category %in% ordering$cog_ordering, ]
    COG_PFAM_plot <- plot_relative_counts(plot)
  }
  return(COG_PFAM_plot)
}

# Usage
COG_PFAM_overlap_split <- generate_plots(cog_pfam_df_overlap, split = TRUE, split_n = 4)
## Warning: Removed 2 rows containing missing values (`position_stack()`).
## Warning: Removed 2 rows containing missing values (`geom_text()`).
## Warning: Removed 1 rows containing missing values (`geom_text()`).

ggsave("./Output_figures_n_plots/fig3.1_COG_PFAM_overlap_split.svg", plot=COG_PFAM_overlap_split, device=CairoSVG, width = 20, height = 11.3)
ggsave("./Output_figures_n_plots/fig3.1_COG_PFAM_overlap_split.pdf", plot=COG_PFAM_overlap_split, device=CairoPDF, width = 20, height = 11.3)
ggsave("./Output_figures_n_plots/fig3.1_COG_PFAM_overlap_split.png", plot=COG_PFAM_overlap_split, width = 20, height = 11.3, dpi=400)

COG_PFAM_plot_overlap <- generate_plots(cog_pfam_df_overlap)

ggsave("./Output_figures_n_plots/fig3_COG_PFAM_overlap_split.svg", plot=COG_PFAM_plot_overlap, device=CairoSVG, width = 11, height = 24.3)
## Warning: Removed 2 rows containing missing values (`position_stack()`).
## Warning: Removed 2 rows containing missing values (`geom_text()`).
## Warning: Removed 1 rows containing missing values (`geom_text()`).
ggsave("./Output_figures_n_plots/fig3_COG_PFAM_overlap_split.pdf", plot=COG_PFAM_plot_overlap, device=CairoPDF, width = 11, height = 24.3)
## Warning: Removed 2 rows containing missing values (`position_stack()`).
## Warning: Removed 2 rows containing missing values (`geom_text()`).
## Warning: Removed 1 rows containing missing values (`geom_text()`).
ggsave("./Output_figures_n_plots/fig3_COG_PFAM_overlap_split.png", plot=COG_PFAM_plot_overlap, width = 11, height = 24.3, dpi=400)
## Warning: Removed 2 rows containing missing values (`position_stack()`).
## Warning: Removed 2 rows containing missing values (`geom_text()`).
## Warning: Removed 1 rows containing missing values (`geom_text()`).
COG_colors_multiple_cat <- c(
  "M" = "#890000",
  "K" = "#CC79A7",
  "L" = "#673AB7",
  "H" = "#000079",
  "E" = "#008080",
  "G" = "#006000",
  "C" = "#76FF03", 
  "T" = "#FFEB3B",
  "P" = "#FF7000",
  "U" = "#F15C80",
  "J" = "#9C27B0", 
  "I" = "#03A9F4", 
  "V" = "#657D9B",
  "O" = "#C5FA30",
  "F" = "#FFB000",
  "D" = "#D8BFD8", 
  "Q" = "#795548",
  "S" = "#9E9E9E",
  "Joint COG" = "#ECECEC"
  )

circular_genome_plot_function <- function(df, includegroup, radius, imgname, width, height, save, legend_lab, COG) {
  
  df <- df %>% mutate(mean_p1_sim = rowMeans(select(., p1_sim1, p1_sim2, p1_sim3))) %>% # calculating mean(log2(GeTMM)) for simulated P1 counts
    mutate(mean_wt_sim = rowMeans(select(., wt_sim1, wt_sim2, wt_sim3))) %>% # calculating mean(log2(GeTMM)) for simulated P1 counts
    select(,c("p1","wt","mean_p1_sim", "mean_wt_sim", "log2FoldChange", 
              "stat", "padj", "start", "end", "gene_length", "strand", 
              "product","COG_category", "Description", "Preferred_name")) %>% 
    mutate(line = NA) %>% 
    mutate(COG_category = replace_na(COG_category, "S")) %>% 
    mutate(
      COG_category = 
        ifelse(
          nchar(COG_category) > 1, 
          "Joint COG", 
          COG_category
        )) %>%
    mutate(
      log2FoldChange = 
        ifelse(
          padj > 0.05, 0,
          log2FoldChange
        )) %>% 
    mutate(
      log2FoldChange =
        ifelse(
            log2FoldChange < -2.5, 
            -2.5, 
            ifelse(
              log2FoldChange > 2.5, 
              2.5, 
              log2FoldChange
            )))%>%
#    mutate(
#      pval_sig = 
#        ifelse(
#          padj < 0.05, "*",
#          NA
#        )) %>%
    mutate(
      pval_sig = 
        ifelse(
          padj < 0.01, "**",
          NA
        )) %>%
    mutate(
      pval_sig = 
        ifelse(
          padj < 0.001, "***",
          pval_sig
        ))
  
  calculate_angle_known_gene <- function(df) {
    df_calculate <- df
    rows_before <- data.frame(matrix(ncol = ncol(df_calculate), nrow = as.integer(nrow(df)*0.025)))
    colnames(rows_before) <- colnames(df_calculate)
    rows_after <- data.frame(matrix(ncol = ncol(df_calculate), nrow = as.integer(nrow(df)*0.025)))
    colnames(rows_after) <- colnames(df_calculate)
    df_calculate <- rbind(rows_before, df_calculate, rows_after)
    df_calculate$index <- 1:nrow(df_calculate) 
    df_calculate <- mutate(df_calculate, angle = 90 - 360 * (index - min(index)) / (max(index) - min(index)))
    return(df_calculate)
  }
  
  circular_df <- calculate_angle_known_gene(df)
  circular_df <- calculate_angle_known_gene(circular_df) %>% 
    mutate(start_position = case_when(index == 1 ~ radius, TRUE ~ 0)) %>% 
    rowwise() %>% mutate(max_value = max(c_across(includegroup))) %>% 
    pivot_longer(cols = c(includegroup,start_position)) %>% 
    mutate(value = value) %>% 
    mutate(gene_sig_pos = ifelse(name == "p1_sig" & value ==12, gene, NA)) %>% 
    mutate(gene_sig_neg = ifelse(name == "p1_sig" & value ==0, gene, NA)) %>% 
    filter(!is.na(value))

  circular_genome_lfc <- function(circular_df, imgname, width, height, save, legend_lab) {
    circular_df$COG_category <- as.factor(circular_df$COG_category)

    plot_gg <- 
      circular_df %>% 
      ggplot() +
    
      # Experimental (EV) heatmaps
      geom_rect(data = subset(circular_df, name == "p1"), aes(xmin = index, xmax = (index+0.98), 
                                                                    ymin = -4.5, ymax = -2.5, fill = value), 
                color = alpha("#8B005B", 0), 
                size = 0.00) +
      geom_rect(data = subset(circular_df, name == "wt"), aes(xmin = index, xmax = (index+0.98), 
                                                                   ymin = -6.5, ymax = -4.5, fill = value), 
                color = alpha("#007D75", 0),
                size = 0.00) +
    
      # adding white line between exp and sim
      geom_rect(data = subset(circular_df, name == "line"), aes(xmin = index, xmax = (index+0.98), 
                                                                   ymin = -7, ymax = -6.5, fill = value), 
                alpha = 0.00, 
                color = alpha("white", 0),
                size = 0.00) +
    
      # Simulated (ASF519) heatmaps 
      geom_rect(data = subset(circular_df, name == "mean_p1_sim"), aes(xmin = index, xmax = (index+0.98), 
                                                              ymin = -9, ymax = -7, fill = value), 
                color = alpha("#FF80E0", 0),
                size = 0.00) +

      geom_rect(data = subset(circular_df, name == "mean_wt_sim"), aes(xmin = index, xmax = (index+0.98), 
                                                              ymin = -11, ymax = -9, fill = value), 
                color = alpha("#66FFFF", 0),
                size = 0.00) +
      # adding white line between p1 and p1_sim groups
      geom_rect(data = subset(circular_df, name == "line"), aes(xmin = index, xmax = (index+0.98), 
                                                              ymin = -14, ymax = -11, fill = value), 
                color = alpha("white", 0),
                size = 0.00) +
      geom_rect(data = subset(circular_df, name == "start_position"), aes(xmin = index, xmax = (index+0.98), 
                                                                          ymin = -30, ymax = -14),
                alpha = 0,
                color = "white",
                size = 0.00) +
      scale_fill_gradientn(colors = c("darkblue", "white", "darkred"))+
    
      geom_text(x=0, y=0.5, label="log2fold Change", size = 2) +
      geom_text(x=0, y=-0.5, label="(p.adj < 0.05)", size = 2) +
      geom_text(x=0, y=-3.5, label="p1: EV", size = 2) +
      geom_text(x=0, y=-5.5, label="wt: EV", size = 2) +
      geom_text(x=0, y=-8, label="mean(p1_sim): ASF519", size = 2) +
      geom_text(x=0, y=-10, label="mean(wt_sim): ASF519", size = 2) +
    
      # Adding Log2fold change plot
      geom_rect(data = circular_df, 
             aes(xmin = index, 
                 xmax = index + 0.98, 
                 ymin = 0, 
                 ymax = log2FoldChange)) +
      theme_minimal() +
      egg::theme_article() +
      theme(axis.text.y = element_blank(),  # Remove y-axis text
            axis.ticks.y = element_blank()) +  # Remove y-axis ticks)
          
      # Make plot circular
      coord_polar() +
        
      # Add labels
      labs(fill = legend_lab,
           title="Heatmap: log2(GeTMM) ordered by Genomic Location", 
           x=" ", y=" ", tag = " ") +
      # labeling the gene order within genome
      scale_x_continuous(breaks = seq(-250, 4700, 250))
    # Save plot using cairo
    if(save == TRUE){
    ggsave(plot_gg, file=imgname, width=width, height=height, bg = "transparent", device=cairo_pdf())
    plot_gg
    dev.off()
    }
    if(save == FALSE){
    plot_gg 
    }
  }

  circular_genome_lfc_COG <- function(circular_df, imgname, width, height, save, legend_lab) {
    circular_df$COG_category <- as.factor(circular_df$COG_category)

    plot_gg <- 
      circular_df %>% 
      ggplot() +
    
      # Experimental (EV) heatmaps
      geom_rect(data = subset(circular_df, name == "p1"), aes(xmin = index, xmax = (index+0.98), 
                                                                  ymin = -4.5, ymax = -2.5),
              alpha = 0.00, 
              color = alpha("white", 0),
              size = 0.00) +
      geom_rect(data = subset(circular_df, name == "wt"), aes(xmin = index, xmax = (index+0.98), 
                                                                 ymin = -6.5, ymax = -4.5),
              alpha = 0.00, 
              color = alpha("white", 0),
              size = 0.00) +
    
      # adding white line between exp and sim
      geom_rect(data = subset(circular_df, name == "line"), aes(xmin = index, xmax = (index+0.98), 
                                                                 ymin = -7, ymax = -6.5),
              alpha = 0.00, 
              color = alpha("white", 0),
              size = 0.00) +
    
      # Simulated (ASF519) heatmaps 
      geom_rect(data = subset(circular_df, name == "mean_p1_sim"), aes(xmin = index, xmax = (index+0.98), 
                                                            ymin = -9, ymax = -7),
              alpha = 0.00, 
              color = alpha("white", 0),
              size = 0.00) +

      geom_rect(data = subset(circular_df, name == "mean_wt_sim"), aes(xmin = index, xmax = (index+0.98), 
                                                            ymin = -11, ymax = -9),
              alpha = 0.00, 
              color = alpha("white", 0),
              size = 0.00) +
      # adding white line between p1 and p1_sim groups
      geom_rect(data = subset(circular_df, name == "line"), aes(xmin = index, xmax = (index+0.98), 
                                                            ymin = -14, ymax = -11),
              color = alpha("white", 0),
              size = 0.00) +
      geom_rect(data = subset(circular_df, name == "start_position"), aes(xmin = index, xmax = (index+0.98), 
                                                                        ymin = -30, ymax = -14),
              alpha = 0,
              color = "white",
              size = 0.00) +
      geom_text(aes(x = index+0.49, y = 2.5, label = pval_sig, angle = angle),color="black", size = 0.2, alpha=0.3, family = "Times",fontface = "bold", hjust = 0) +
    
      geom_text(x=0, y=0.5, label="log2fold Change", size = 2) +
    
      # Adding Log2fold change plot
      geom_rect(data = circular_df, 
           aes(xmin = index, 
               xmax = index + 0.98, 
               ymin = 0, 
               ymax = log2FoldChange,
               fill = COG_category)) +
      scale_fill_manual(values = COG_colors_multiple_cat) +
    
      theme_minimal() +
      egg::theme_article() +
      theme(axis.text.y = element_blank(),  # Remove y-axis text
            axis.ticks.y = element_blank()) +  # Remove y-axis ticks)
          
      # Make plot circular
      coord_polar() +
      
      # Add labels
      labs(fill = legend_lab,
           title="Heatmap: log2(GeTMM) ordered by Genomic Location", 
           x=" ", y=" ", tag = " ") +
      # labeling the gene order within genome
      scale_x_continuous(breaks = seq(-250, 4700, 250))
    # Save plot using cairo
    if(save == TRUE){
    ggsave(plot_gg, file=imgname, width=width, height=height, bg = "transparent", device=cairo_pdf())
    plot_gg
    dev.off()
    }
    if(save == FALSE){
    plot_gg 
    }
  }
  return(circular_df)
  if (COG) {
    circular_genome_lfc_COG(circular_df, imgname, width, height, save, legend_lab)
  } else {
    circular_genome_lfc(circular_df, imgname, width, height, save, legend_lab)
  }
}

# Usage:
operon_lfc_df <- circular_genome_plot_function(df = log2getmm_deseq_full, includegroup = c("p1","wt","line","mean_p1_sim", "mean_wt_sim"), radius = 50, imgname = "/path/to/save/image.pdf", width = 9, height = 9, save = FALSE, legend_lab = "log2(GeTMM)", COG = TRUE)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `max_value = max(c_across(includegroup))`.
## ℹ In row 1.
## Caused by warning:
## ! Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(includegroup)
## 
##   # Now:
##   data %>% select(all_of(includegroup))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
#circular_genome_plot_function(df = log2getmm_deseq_full, includegroup = c("p1","wt","line","mean_p1_sim", "mean_wt_sim"), radius = 50, imgname = "/path/to/save/image.pdf", width = 9, height = 9, save = FALSE, legend_lab = "log2(GeTMM)", COG = FALSE)

circular_genome_plot_function(df = log2getmm_deseq_full, includegroup = c("p1","wt","line","mean_p1_sim", "mean_wt_sim"), radius = 50, imgname = "./Output_figures_n_plots/Circular_heatmap_COG.pdf", width = 9, height = 9, save = TRUE, legend_lab = "log2(GeTMM)", COG = TRUE)
## # A tibble: 22,637 × 19
##    log2FoldChange  stat  padj start   end gene_length strand product
##             <dbl> <dbl> <dbl> <int> <int>       <int> <chr>  <chr>  
##  1             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  2             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  3             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  4             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  5             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  6             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  7             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  8             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  9             NA    NA    NA    NA    NA          NA <NA>   <NA>   
## 10             NA    NA    NA    NA    NA          NA <NA>   <NA>   
## # ℹ 22,627 more rows
## # ℹ 11 more variables: COG_category <chr>, Description <chr>,
## #   Preferred_name <chr>, pval_sig <chr>, index <int>, angle <dbl>,
## #   max_value <dbl>, name <chr>, value <dbl>, gene_sig_pos <lgl>,
## #   gene_sig_neg <lgl>
circular_genome_plot_function(df = log2getmm_deseq_full, includegroup = c("p1","wt","line","mean_p1_sim", "mean_wt_sim"), radius = 50, imgname = "./Output_figures_n_plots/Circular_heatmap_noCOG.pdf", width = 9, height = 9, save = TRUE, legend_lab = "log2(GeTMM)", COG = FALSE)
## # A tibble: 22,637 × 19
##    log2FoldChange  stat  padj start   end gene_length strand product
##             <dbl> <dbl> <dbl> <int> <int>       <int> <chr>  <chr>  
##  1             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  2             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  3             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  4             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  5             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  6             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  7             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  8             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  9             NA    NA    NA    NA    NA          NA <NA>   <NA>   
## 10             NA    NA    NA    NA    NA          NA <NA>   <NA>   
## # ℹ 22,627 more rows
## # ℹ 11 more variables: COG_category <chr>, Description <chr>,
## #   Preferred_name <chr>, pval_sig <chr>, index <int>, angle <dbl>,
## #   max_value <dbl>, name <chr>, value <dbl>, gene_sig_pos <lgl>,
## #   gene_sig_neg <lgl>
circular_genome_plot_function(df = log2getmm_deseq_full, includegroup = c("p1","wt","line","mean_p1_sim", "mean_wt_sim"), radius = 50, imgname = "./Output_figures_n_plots/Circular_heatmap_COG.png", width = 9, height = 9, save = TRUE, legend_lab = "log2(GeTMM)", COG = TRUE)
## # A tibble: 22,637 × 19
##    log2FoldChange  stat  padj start   end gene_length strand product
##             <dbl> <dbl> <dbl> <int> <int>       <int> <chr>  <chr>  
##  1             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  2             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  3             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  4             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  5             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  6             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  7             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  8             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  9             NA    NA    NA    NA    NA          NA <NA>   <NA>   
## 10             NA    NA    NA    NA    NA          NA <NA>   <NA>   
## # ℹ 22,627 more rows
## # ℹ 11 more variables: COG_category <chr>, Description <chr>,
## #   Preferred_name <chr>, pval_sig <chr>, index <int>, angle <dbl>,
## #   max_value <dbl>, name <chr>, value <dbl>, gene_sig_pos <lgl>,
## #   gene_sig_neg <lgl>
circular_genome_plot_function(df = log2getmm_deseq_full, includegroup = c("p1","wt","line","mean_p1_sim", "mean_wt_sim"), radius = 50, imgname = "./Output_figures_n_plots/Circular_heatmap_noCOG.png", width = 9, height = 9, save = TRUE, legend_lab = "log2(GeTMM)", COG = FALSE)
## # A tibble: 22,637 × 19
##    log2FoldChange  stat  padj start   end gene_length strand product
##             <dbl> <dbl> <dbl> <int> <int>       <int> <chr>  <chr>  
##  1             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  2             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  3             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  4             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  5             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  6             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  7             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  8             NA    NA    NA    NA    NA          NA <NA>   <NA>   
##  9             NA    NA    NA    NA    NA          NA <NA>   <NA>   
## 10             NA    NA    NA    NA    NA          NA <NA>   <NA>   
## # ℹ 22,627 more rows
## # ℹ 11 more variables: COG_category <chr>, Description <chr>,
## #   Preferred_name <chr>, pval_sig <chr>, index <int>, angle <dbl>,
## #   max_value <dbl>, name <chr>, value <dbl>, gene_sig_pos <lgl>,
## #   gene_sig_neg <lgl>

COG_colors_multiple_cat <- c(
  "M" = "#890000",
  "K" = "#CC79A7",
  "L" = "#673AB7",
  "H" = "#000079",
  "E" = "#008080",
  "G" = "#006000",
  "C" = "#76FF03", 
  "T" = "#FFEB3B",
  "P" = "#FF7000",
  "U" = "#F15C80",
  "J" = "#9C27B0", 
  "I" = "#03A9F4", 
  "V" = "#657D9B",
  "O" = "#C5FA30",
  "F" = "#FFB000",
  "D" = "#D8BFD8", 
  "Q" = "#795548",
  "S" = "#9E9E9E",
  "Joint COG" = "#ECECEC"
  )

circular_genome_plot_function_v2 <- function(df, includegroup, radius, imgname, width, height, save, legend_lab, COG) {
  
  df <- df %>% mutate(mean_p1_sim = rowMeans(select(., p1_sim1, p1_sim2, p1_sim3))) %>% # calculating mean(log2(GeTMM)) for simulated P1 counts
    mutate(mean_wt_sim = rowMeans(select(., wt_sim1, wt_sim2, wt_sim3))) %>% # calculating mean(log2(GeTMM)) for simulated P1 counts
    select(,c("p1","wt","mean_p1_sim", "mean_wt_sim", "log2FoldChange", 
              "stat", "padj", "start", "end", "gene_length", "strand", 
              "product","COG_category", "Description", "Preferred_name")) %>% 
    mutate(line = NA) %>% 
    mutate(COG_category = replace_na(COG_category, "S")) %>% 
    mutate(
      COG_category = 
        ifelse(
          nchar(COG_category) > 1, 
          "Joint COG", 
          COG_category
        )) %>%
    mutate(
      log2FoldChange = 
        ifelse(
          padj > 0.05, 0,
          log2FoldChange
        )) %>% 
    mutate(
      log2FoldChange =
        ifelse(
            log2FoldChange < -2.5, 
            -2.5, 
            ifelse(
              log2FoldChange > 2.5, 
              2.5, 
              log2FoldChange
            )))%>%
#    mutate(
#      pval_sig = 
#        ifelse(
#          padj < 0.05, "*",
#          NA
#        )) %>%
    mutate(
      pval_sig = 
        ifelse(
          padj < 0.01, "**",
          NA
        )) %>%
    mutate(
      pval_sig = 
        ifelse(
          padj < 0.001, "***",
          pval_sig
        ))
  
  calculate_angle_known_gene <- function(df) {
    df_calculate <- df
    rows_before <- data.frame(matrix(ncol = ncol(df_calculate), nrow = as.integer(nrow(df)*0.025)))
    colnames(rows_before) <- colnames(df_calculate)
    rows_after <- data.frame(matrix(ncol = ncol(df_calculate), nrow = as.integer(nrow(df)*0.025)))
    colnames(rows_after) <- colnames(df_calculate)
    df_calculate <- rbind(rows_before, df_calculate, rows_after)
    df_calculate$index <- 1:nrow(df_calculate) 
    df_calculate <- mutate(df_calculate, angle = 90 - 360 * (index - min(index)) / (max(index) - min(index)))
    return(df_calculate)
  }
  
  circular_df <- calculate_angle_known_gene(df)
  circular_df <- calculate_angle_known_gene(circular_df) %>% 
    mutate(start_position = case_when(index == 1 ~ radius, TRUE ~ 0)) %>% 
    rowwise() %>% mutate(max_value = max(c_across(includegroup))) %>% 
    pivot_longer(cols = c(includegroup,start_position)) %>% 
    mutate(value = value) %>% 
    mutate(gene_sig_pos = ifelse(name == "p1_sig" & value ==12, gene, NA)) %>% 
    mutate(gene_sig_neg = ifelse(name == "p1_sig" & value ==0, gene, NA)) %>% 
    filter(!is.na(value))

  circular_genome_lfc <- function(circular_df, imgname, width, height, save, legend_lab) {
    circular_df$COG_category <- as.factor(circular_df$COG_category)

    plot_gg <- 
      circular_df %>% 
      ggplot() +
    
      # Experimental (EV) heatmaps
      geom_rect(data = subset(circular_df, name == "p1"), aes(xmin = index, xmax = (index+0.98), 
                                                                    ymin = -9.5+2.5, ymax = -7.5+2.5, fill = value), 
                color = alpha("#8B005B", 0), 
                size = 0.00) +
      geom_rect(data = subset(circular_df, name == "wt"), aes(xmin = index, xmax = (index+0.98), 
                                                                   ymin = -11.5+2.5, ymax = -9.5+2.5, fill = value), 
                color = alpha("#007D75", 0),
                size = 0.00) +
    
      # adding white line between exp and sim
      geom_rect(data = subset(circular_df, name == "line"), aes(xmin = index, xmax = (index+0.98), 
                                                                   ymin = -12+2.5, ymax = -11.5+2.5, fill = value), 
                alpha = 0.00, 
                color = alpha("white", 0),
                size = 0.00) +
    
      # Simulated (ASF519) heatmaps 
      geom_rect(data = subset(circular_df, name == "mean_p1_sim"), aes(xmin = index, xmax = (index+0.98), 
                                                              ymin = -14+2.5, ymax = -12+2.5, fill = value), 
                color = alpha("#FF80E0", 0),
                size = 0.00) +

      geom_rect(data = subset(circular_df, name == "mean_wt_sim"), aes(xmin = index, xmax = (index+0.98), 
                                                              ymin = -16+2.5, ymax = -14+2.5, fill = value), 
                color = alpha("#66FFFF", 0),
                size = 0.00) +
      # adding white line between p1 and p1_sim groups
      geom_rect(data = subset(circular_df, name == "line"), aes(xmin = index, xmax = (index+0.98), 
                                                              ymin = -19+2.5, ymax = -16+2.5, fill = value), 
                color = alpha("white", 0),
                size = 0.00) +
      geom_rect(data = subset(circular_df, name == "start_position"), aes(xmin = index, xmax = (index+0.98), 
                                                                          ymin = -30, ymax = -19+2.5),
                alpha = 0,
                color = "white",
                size = 0.00) +
      scale_fill_gradientn(colors = c("darkblue", "white", "darkred"))+
      geom_text(aes(x = index+0.49, y = -11.85+2.5, label = pval_sig, angle = angle),color="black", size = 0.2, alpha=0.3, family = "Times",fontface = "bold", hjust = 0) +
    
    
      geom_text(x=0, y=0.5+2.5, label="log2fold Change", size = 2) +
      geom_text(x=0, y=-0.5+2.5, label="(p.adj < 0.05)", size = 2) +
      geom_text(x=0, y=-8.5+2.5, label="P1: EV", size = 2) +
      geom_text(x=0, y=-10.5+2.5, label="WT: EV", size = 2) +
      geom_text(x=0, y=-13+2.5, label="mean(p1_sim): ASF519", size = 2) +
      geom_text(x=0, y=-15+2.5, label="mean(wt_sim): ASF519", size = 2) +
    
      # Adding Log2fold change plot
      geom_rect(data = circular_df, 
             aes(xmin = index, 
                 xmax = index + 0.98, 
                 ymin = 0, 
                 ymax = log2FoldChange*2)) +
      theme_minimal() +
      egg::theme_article() +
      theme(axis.text.y = element_blank(),  # Remove y-axis text
            axis.ticks.y = element_blank(),  # Remove y-axis ticks
            axis.text.x = element_blank(),  # Remove x-axis text'
            axis.ticks.x = element_blank()) +  # Remove x-axis ticks

      # Make plot circular
      coord_polar() +
        
      # Add labels
      labs(fill = legend_lab,
           title="Heatmap: log2(GeTMM) ordered by Genomic Location", 
           x=" ", y=" ", tag = " ") +
      # labeling the gene order within genome
      scale_x_continuous(breaks = seq(-250, 4700, 250))
    # Save plot using cairo
    if(save == TRUE){
    ggsave(plot_gg, file=imgname, width=width, height=height, bg = "transparent", device=cairo_pdf())
    plot_gg
    dev.off()
    }
    if(save == FALSE){
    plot_gg 
    }
  }

  circular_genome_lfc_COG <- function(circular_df, imgname, width, height, save, legend_lab) {
    circular_df$COG_category <- as.factor(circular_df$COG_category)

    plot_gg <- 
      circular_df %>% 
      ggplot() +
    
      # Experimental (EV) heatmaps
      geom_rect(data = subset(circular_df, name == "p1"), aes(xmin = index, xmax = (index+0.98), 
                                                                  ymin = -9.5+2.5, ymax = -7.5+2.5),
              alpha = 0.00, 
              color = alpha("white", 0),
              size = 0.00) +
      geom_rect(data = subset(circular_df, name == "wt"), aes(xmin = index, xmax = (index+0.98), 
                                                                 ymin = -11.5+2.5, ymax = -9.5+2.5),
              alpha = 0.00, 
              color = alpha("white", 0),
              size = 0.00) +
    
      # adding white line between exp and sim
      geom_rect(data = subset(circular_df, name == "line"), aes(xmin = index, xmax = (index+0.98), 
                                                                 ymin = -12+2.5, ymax = -11.5+2.5),
              alpha = 0.00, 
              color = alpha("white", 0),
              size = 0.00) +
    
      # Simulated (ASF519) heatmaps 
      geom_rect(data = subset(circular_df, name == "mean_p1_sim"), aes(xmin = index, xmax = (index+0.98), 
                                                            ymin = -14+2.5, ymax = -12+2.5),
              alpha = 0.00, 
              color = alpha("white", 0),
              size = 0.00) +

      geom_rect(data = subset(circular_df, name == "mean_wt_sim"), aes(xmin = index, xmax = (index+0.98), 
                                                            ymin = -16+2.5, ymax = -14+2.5),
              alpha = 0.00, 
              color = alpha("white", 0),
              size = 0.00) +
      # adding white line between p1 and p1_sim groups
      geom_rect(data = subset(circular_df, name == "line"), aes(xmin = index, xmax = (index+0.98), 
                                                            ymin = -19+2.5, ymax = -16+2.5),
              color = alpha("white", 0),
              size = 0.00) +
      geom_rect(data = subset(circular_df, name == "start_position"), aes(xmin = index, xmax = (index+0.98), 
                                                                        ymin = -30, ymax = -19+2.5),
              alpha = 0,
              color = "white",
              size = 0.00) +
      geom_text(aes(x = index+0.49, y = -11.85+2.5, label = pval_sig, angle = angle),color="black", size = 0.2, alpha=0.3, family = "Times",fontface = "bold", hjust = 0) +
    
      geom_text(x=0, y=0.5, label="log2fold Change", size = 2) +
    
      # Adding Log2fold change plot
      geom_rect(data = circular_df, 
           aes(xmin = index, 
               xmax = index + 0.98, 
               ymin = 0, 
               ymax = log2FoldChange*2,
               fill = COG_category)) +
      scale_fill_manual(values = COG_colors_multiple_cat) +
    
      theme_minimal() +
      egg::theme_article() +
      theme(axis.text.y = element_blank(),  # Remove y-axis text
            axis.ticks.y = element_blank(),  # Remove y-axis ticks
            axis.text.x = element_blank(),  # Remove x-axis text'
            axis.ticks.x = element_blank()) +  # Remove x-axis ticks
          
      # Make plot circular
      coord_polar() +
      
      # Add labels
      labs(fill = legend_lab,
           title="Heatmap: log2(GeTMM) ordered by Genomic Location", 
           x=" ", y=" ", tag = " ") +
      # labeling the gene order within genome
      scale_x_continuous(breaks = seq(-250, 4700, 250))
    # Save plot using cairo
    if(save == TRUE){
    ggsave(plot_gg, file=imgname, width=width, height=height, bg = "transparent", device=cairo_pdf())
    plot_gg
    dev.off()
    }
    if(save == FALSE){
    plot_gg 
    }
  }

  if (COG) {
    circular_genome_lfc_COG(circular_df, imgname, width, height, save, legend_lab)
  } else {
    circular_genome_lfc(circular_df, imgname, width, height, save, legend_lab)
  }
}

# Usage:
#circular_genome_plot_function_v2(df = log2getmm_deseq_full, includegroup = c("p1","wt","line","mean_p1_sim", "mean_wt_sim"), radius = 50, imgname = "/path/to/save/image.pdf", width = 9, height = 9, save = FALSE, legend_lab = "log2(GeTMM)", COG = TRUE)
#circular_genome_plot_function_v2(df = log2getmm_deseq_full, includegroup = c("p1","wt","line","mean_p1_sim", "mean_wt_sim"), radius = 50, imgname = "/path/to/save/image.pdf", width = 9, height = 9, save = FALSE, legend_lab = "log2(GeTMM)", COG = FALSE)

circular_genome_plot_function_v2(df = log2getmm_deseq_full, includegroup = c("p1","wt","line","mean_p1_sim", "mean_wt_sim"), radius = 50, imgname = "./Output_figures_n_plots/Circular_heatmap_COG_v2.pdf", width = 9, height = 9, save = TRUE, legend_lab = "log2(GeTMM)", COG = TRUE)
circular_genome_plot_function_v2(df = log2getmm_deseq_full, includegroup = c("p1","wt","line","mean_p1_sim", "mean_wt_sim"), radius = 50, imgname = "./Output_figures_n_plots/Circular_heatmap_noCOG_v2.pdf", width = 9, height = 9, save = TRUE, legend_lab = "log2(GeTMM)", COG = FALSE)

circular_genome_plot_function_v2(df = log2getmm_deseq_full, includegroup = c("p1","wt","line","mean_p1_sim", "mean_wt_sim"), radius = 50, imgname = "./Output_figures_n_plots/Circular_heatmap_COG_v2.png", width = 9, height = 9, save = TRUE, legend_lab = "log2(GeTMM)", COG = TRUE)
circular_genome_plot_function_v2(df = log2getmm_deseq_full, includegroup = c("p1","wt","line","mean_p1_sim", "mean_wt_sim"), radius = 50, imgname = "./Output_figures_n_plots/Circular_heatmap_noCOG_v2.png", width = 9, height = 9, save = TRUE, legend_lab = "log2(GeTMM)", COG = FALSE)
COG_colors_multiple_cat <- c(
  "M" = "#890000",
  "K" = "#CC79A7",
  "L" = "#673AB7",
  "H" = "#000079",
  "E" = "#008080",
  "G" = "#006000",
  "C" = "#76FF03", 
  "T" = "#FFEB3B",
  "P" = "#FF7000",
  "U" = "#F15C80",
  "J" = "#9C27B0", 
  "I" = "#03A9F4", 
  "V" = "#657D9B",
  "O" = "#C5FA30",
  "F" = "#FFB000",
  "D" = "#D8BFD8", 
  "Q" = "#795548",
  "S" = "#9E9E9E",
  "Joint COG" = "#ECECEC"
  )

circular_genome_plot_function_v3 <- function(df, includegroup, radius, imgname, width, height, save, legend_lab, COG) {
  
  df <- df %>% mutate(mean_p1_sim = rowMeans(select(., p1_sim1, p1_sim2, p1_sim3))) %>% # calculating mean(log2(GeTMM)) for simulated P1 counts
    mutate(mean_wt_sim = rowMeans(select(., wt_sim1, wt_sim2, wt_sim3))) %>% # calculating mean(log2(GeTMM)) for simulated P1 counts
    select(,c("p1","wt","mean_p1_sim", "mean_wt_sim", "log2FoldChange", 
              "stat", "padj", "start", "end", "gene_length", "strand", 
              "product","COG_category", "Description", "Preferred_name")) %>% 
    mutate(line = NA) %>% 
    mutate(COG_category = replace_na(COG_category, "S")) %>% 
    mutate(
      COG_category = 
        ifelse(
          nchar(COG_category) > 1, 
          "Joint COG", 
          COG_category
        )) %>%
    mutate(
      log2FoldChange = 
        ifelse(
          padj > 0.05, 0,
          log2FoldChange
        )) %>% 
    mutate(
      log2FoldChange =
        ifelse(
            log2FoldChange < -2.5, 
            -2.5, 
            ifelse(
              log2FoldChange > 2.5, 
              2.5, 
              log2FoldChange
            )))%>%
#    mutate(
#      pval_sig = 
#        ifelse(
#          padj < 0.05, "*",
#          NA
#        )) %>%
    mutate(
      pval_sig = 
        ifelse(
          padj < 0.01, "**",
          NA
        )) %>%
    mutate(
      pval_sig = 
        ifelse(
          padj < 0.001, "***",
          pval_sig
        ))
  
  calculate_angle_known_gene <- function(df) {
    df_calculate <- df
    rows_before <- data.frame(matrix(ncol = ncol(df_calculate), nrow = as.integer(nrow(df)*0.025)))
    colnames(rows_before) <- colnames(df_calculate)
    rows_after <- data.frame(matrix(ncol = ncol(df_calculate), nrow = as.integer(nrow(df)*0.025)))
    colnames(rows_after) <- colnames(df_calculate)
    df_calculate <- rbind(rows_before, df_calculate, rows_after)
    df_calculate$index <- 1:nrow(df_calculate) 
    df_calculate <- mutate(df_calculate, angle = 90 - 360 * (index - min(index)) / (max(index) - min(index)))
    return(df_calculate)
  }
  
  circular_df <- calculate_angle_known_gene(df)
  circular_df <- calculate_angle_known_gene(circular_df) %>% 
    mutate(start_position = case_when(index == 1 ~ radius, TRUE ~ 0)) %>% 
    rowwise() %>% mutate(max_value = max(c_across(includegroup))) %>% 
    pivot_longer(cols = c(includegroup,start_position)) %>% 
    mutate(value = value) %>% 
    mutate(gene_sig_pos = ifelse(name == "p1_sig" & value ==12, gene, NA)) %>% 
    mutate(gene_sig_neg = ifelse(name == "p1_sig" & value ==0, gene, NA)) %>% 
    filter(!is.na(value))

  circular_genome_lfc <- function(circular_df, imgname, width, height, save, legend_lab) {
    circular_df$COG_category <- as.factor(circular_df$COG_category)

    plot_gg <- 
      circular_df %>% 
      ggplot() +
    
      # Experimental (EV) heatmaps
      geom_rect(data = subset(circular_df, name == "p1"), aes(xmin = index, xmax = (index+0.98), 
                                                                    ymin = -9.5+2.5, ymax = -7.5+2.5, fill = value), 
                color = alpha("#8B005B", 0), 
                size = 0.00) +
      geom_rect(data = subset(circular_df, name == "wt"), aes(xmin = index, xmax = (index+0.98), 
                                                                   ymin = -11.5+2.5, ymax = -9.5+2.5, fill = value), 
                color = alpha("#007D75", 0),
                size = 0.00) +
    
      # adding white line between exp and sim
      #geom_rect(data = subset(circular_df, name == "line"), aes(xmin = index, xmax = (index+0.98), 
      #                                                             ymin = -12+2.5, ymax = -11.5+2.5, fill = value), 
      #          alpha = 0.00, 
      #          color = alpha("white", 0),
      #          size = 0.00) +
    
      # Simulated (ASF519) heatmaps 
      geom_rect(data = subset(circular_df, name == "mean_p1_sim"), aes(xmin = index, xmax = (index+0.98), 
                                                              ymin = -14+2.5, ymax = -12+2.5, fill = value), 
                color = alpha("#FF80E0", 0),
                size = 0.00) +

      geom_rect(data = subset(circular_df, name == "mean_wt_sim"), aes(xmin = index, xmax = (index+0.98), 
                                                              ymin = -16+2.5, ymax = -14+2.5, fill = value), 
                color = alpha("#66FFFF", 0),
                size = 0.00) +
      # adding white line between p1 and p1_sim groups
      geom_rect(data = subset(circular_df, name == "line"), aes(xmin = index, xmax = (index+0.98), 
                                                              ymin = -19+2.5, ymax = -16+2.5, fill = value), 
                color = alpha("white", 0),
                size = 0.00) +
      geom_rect(data = subset(circular_df, name == "start_position"), aes(xmin = index, xmax = (index+0.98), 
                                                                          ymin = -30, ymax = -19+2.5),
                alpha = 0,
                color = "white",
                size = 0.00) +
      scale_fill_gradientn(colors = c("darkblue", "white", "darkred"))+
      geom_text(aes(x = index+0.49, y = -11.85+2.5, label = pval_sig, angle = angle),color="black", size = 0.2, alpha=0.3, family = "Times",fontface = "bold", hjust = 0) +
    
    
      #geom_text(x=0, y=0.5+2.5, label="log2fold Change", size = 2) +
      #geom_text(x=0, y=-0.5+2.5, label="(p.adj < 0.05)", size = 2) +
      #geom_text(x=0, y=-8.5+2.5, label="P1: EV", size = 2) +
      #geom_text(x=0, y=-10.5+2.5, label="WT: EV", size = 2) +
      #geom_text(x=0, y=-13+2.5, label="mean(p1_sim): ASF519", size = 2) +
      #geom_text(x=0, y=-15+2.5, label="mean(wt_sim): ASF519", size = 2) +
    
      # Adding Log2fold change plot
      #geom_rect(data = circular_df, 
      #       aes(xmin = index, 
      #           xmax = index + 0.98, 
      #           ymin = 0, 
      #           ymax = log2FoldChange*2)) +
      theme_minimal() +
      egg::theme_article() +
      theme(axis.text.y = element_blank(),  # Remove y-axis text
            axis.ticks.y = element_blank(),  # Remove y-axis ticks
            axis.text.x = element_blank(),  # Remove x-axis text'
            axis.ticks.x = element_blank()) +  # Remove x-axis ticks

      # Make plot circular
      coord_polar() +
        
      # Add labels
      labs(fill = legend_lab,
           title="Heatmap: log2(GeTMM) ordered by Genomic Location", 
           x=" ", y=" ", tag = " ") +
      # labeling the gene order within genome
      scale_x_continuous(breaks = seq(-250, 4700, 250))
    # Save plot using cairo
    if(save == TRUE){
    ggsave(plot_gg, file=imgname, width=width, height=height, bg = "transparent", device=cairo_pdf())
    plot_gg
    dev.off()
    }
    if(save == FALSE){
    plot_gg 
    }
  }

  circular_genome_lfc_COG <- function(circular_df, imgname, width, height, save, legend_lab) {
    circular_df$COG_category <- as.factor(circular_df$COG_category)

    plot_gg <- 
      circular_df %>% 
      ggplot() +
    
      # Experimental (EV) heatmaps
      geom_rect(data = subset(circular_df, name == "p1"), aes(xmin = index, xmax = (index+0.98), 
                                                                  ymin = -9.5+2.5, ymax = -7.5+2.5),
              alpha = 0.00, 
              color = alpha("white", 0),
              size = 0.00) +
      geom_rect(data = subset(circular_df, name == "wt"), aes(xmin = index, xmax = (index+0.98), 
                                                                 ymin = -11.5+2.5, ymax = -9.5+2.5),
              alpha = 0.00, 
              color = alpha("white", 0),
              size = 0.00) +
    
      # adding white line between exp and sim
      geom_rect(data = subset(circular_df, name == "line"), aes(xmin = index, xmax = (index+0.98), 
                                                                 ymin = -12+2.5, ymax = -11.5+2.5),
              alpha = 0.00, 
              color = alpha("white", 0),
              size = 0.00) +
    
      # Simulated (ASF519) heatmaps 
      geom_rect(data = subset(circular_df, name == "mean_p1_sim"), aes(xmin = index, xmax = (index+0.98), 
                                                            ymin = -14+2.5, ymax = -12+2.5),
              alpha = 0.00, 
              color = alpha("white", 0),
              size = 0.00) +

      geom_rect(data = subset(circular_df, name == "mean_wt_sim"), aes(xmin = index, xmax = (index+0.98), 
                                                            ymin = -16+2.5, ymax = -14+2.5),
              alpha = 0.00, 
              color = alpha("white", 0),
              size = 0.00) +
      # adding white line between p1 and p1_sim groups
      geom_rect(data = subset(circular_df, name == "line"), aes(xmin = index, xmax = (index+0.98), 
                                                            ymin = -19+2.5, ymax = -16+2.5),
              color = alpha("white", 0),
              size = 0.00) +
      geom_rect(data = subset(circular_df, name == "start_position"), aes(xmin = index, xmax = (index+0.98), 
                                                                        ymin = -30, ymax = -19+2.5),
              alpha = 0,
              color = "white",
              size = 0.00) +
      geom_text(aes(x = index+0.49, y = -11.85+2.5, label = pval_sig, angle = angle),color="black", size = 0.2, alpha=0.3, family = "Times",fontface = "bold", hjust = 0) +
    
      geom_text(x=0, y=0.5, label="log2fold Change", size = 2) +
    
      # Adding Log2fold change plot
      geom_rect(data = circular_df, 
           aes(xmin = index, 
               xmax = index + 0.98, 
               ymin = 0, 
               ymax = log2FoldChange*2,
               fill = COG_category)) +
      scale_fill_manual(values = COG_colors_multiple_cat) +
    
      theme_minimal() +
      egg::theme_article() +
      theme(axis.text.y = element_blank(),  # Remove y-axis text
            axis.ticks.y = element_blank(),  # Remove y-axis ticks
            axis.text.x = element_blank(),  # Remove x-axis text'
            axis.ticks.x = element_blank()) +  # Remove x-axis ticks
          
      # Make plot circular
      coord_polar() +
      
      # Add labels
      labs(fill = legend_lab,
           title="Heatmap: log2(GeTMM) ordered by Genomic Location", 
           x=" ", y=" ", tag = " ") +
      # labeling the gene order within genome
      scale_x_continuous(breaks = seq(-250, 4700, 250))
    # Save plot using cairo
    if(save == TRUE){
    ggsave(plot_gg, file=imgname, width=width, height=height, bg = "transparent", device=cairo_pdf())
    plot_gg
    dev.off()
    }
    if(save == FALSE){
    plot_gg 
    }
  }

  if (COG) {
    circular_genome_lfc_COG(circular_df, imgname, width, height, save, legend_lab)
  } else {
    circular_genome_lfc(circular_df, imgname, width, height, save, legend_lab)
  }
}

# Usage:
#circular_genome_plot_function_v3(df = log2getmm_deseq_full, includegroup = c("p1","wt","line","mean_p1_sim", "mean_wt_sim"), radius = 50, imgname = "/path/to/save/image.pdf", width = 9, height = 9, save = FALSE, legend_lab = "log2(GeTMM)", COG = TRUE)
#circular_genome_plot_function_v3(df = log2getmm_deseq_full, includegroup = c("p1","wt","line","mean_p1_sim", "mean_wt_sim"), radius = 50, imgname = "/path/to/save/image.pdf", width = 9, height = 9, save = FALSE, legend_lab = "log2(GeTMM)", COG = FALSE)

circular_genome_plot_function_v3(df = log2getmm_deseq_full, includegroup = c("p1","wt","line","mean_p1_sim", "mean_wt_sim"), radius = 50, imgname = "./Output_figures_n_plots/fig4_Circular_heatmap_COG.pdf", width = 9, height = 9, save = TRUE, legend_lab = "log2(GeTMM)", COG = TRUE)
## Warning: Removed 18682 rows containing missing values (`geom_text()`).
## Warning: Removed 452 rows containing missing values (`geom_rect()`).
circular_genome_plot_function_v3(df = log2getmm_deseq_full, includegroup = c("p1","wt","line","mean_p1_sim", "mean_wt_sim"), radius = 50, imgname = "./Output_figures_n_plots/fig4_Circular_heatmap_noCOG.pdf", width = 9, height = 9, save = TRUE, legend_lab = "log2(GeTMM)", COG = FALSE)
## Warning: Removed 18682 rows containing missing values (`geom_text()`).
circular_genome_plot_function_v3(df = log2getmm_deseq_full, includegroup = c("p1","wt","line","mean_p1_sim", "mean_wt_sim"), radius = 50, imgname = "./Output_figures_n_plots/fig4_Circular_heatmap_COG.png", width = 9, height = 9, save = TRUE, legend_lab = "log2(GeTMM)", COG = TRUE)
## Warning: Removed 18682 rows containing missing values (`geom_text()`).
## Removed 452 rows containing missing values (`geom_rect()`).
circular_genome_plot_function_v3(df = log2getmm_deseq_full, includegroup = c("p1","wt","line","mean_p1_sim", "mean_wt_sim"), radius = 50, imgname = "./Output_figures_n_plots/fig4_Circular_heatmap_noCOG.png", width = 9, height = 9, save = TRUE, legend_lab = "log2(GeTMM)", COG = FALSE)
## Warning: Removed 18682 rows containing missing values (`geom_text()`).
circular_genome_plot_function_v3(df = log2getmm_deseq_full, includegroup = c("p1","wt","line","mean_p1_sim", "mean_wt_sim"), radius = 50, imgname = "./Output_figures_n_plots/fig4_Circular_heatmap_COG.svg", width = 9, height = 9, save = TRUE, legend_lab = "log2(GeTMM)", COG = TRUE)
## Warning: Removed 18682 rows containing missing values (`geom_text()`).
## Removed 452 rows containing missing values (`geom_rect()`).
circular_genome_plot_function_v3(df = log2getmm_deseq_full, includegroup = c("p1","wt","line","mean_p1_sim", "mean_wt_sim"), radius = 50, imgname = "./Output_figures_n_plots/fig4_Circular_heatmap_noCOG.svg", width = 9, height = 9, save = TRUE, legend_lab = "log2(GeTMM)", COG = FALSE)
## Warning: Removed 18682 rows containing missing values (`geom_text()`).
# Read the operon_mapper data (TSV)
operon_data <- read.delim(mapped_operon_path, header = TRUE, stringsAsFactors = FALSE, check.names = FALSE)

# Fill down the Operon numbers for each gene
operon_data$Operon <- zoo::na.locf(operon_data$Operon)

# Filter out rows without position data and select relevant columns
operon_data <- operon_data %>%
  filter(!is.na(PosLeft)) %>%
  select(PosLeft, postRight, Strand, Operon) %>% 
  rename("PosLeft" = "start",
         "postRight" = "end", 
         "Strand" = "strand", 
         "Operon" = "operon")

# Add operon_unit column
operon_data_full <- operon_data %>%
  group_by(operon) %>%
  mutate(operon_unit = ifelse(n() > 1, cur_group_id(), NA)) %>%
  ungroup() %>%
  mutate(operon_unit = ifelse(is.na(operon_unit), NA, 
                              paste0("unit_", match(operon, unique(operon[!is.na(operon_unit)]))))) 
# %>% 
  # rename("PosLeft" = "start",
  #        "postRight" = "end", 
  #        "Strand" = "strand", 
  #        "Operon" = "operon")

# Print the updated dataframe
print(operon_data_full)
## # A tibble: 5,504 × 5
##    start   end strand operon operon_unit
##    <int> <int> <chr>   <int> <chr>      
##  1    28   390 -           1 <NA>       
##  2   624   857 -           2 <NA>       
##  3   993  1595 -           3 unit_1     
##  4  1607  2194 -           3 unit_1     
##  5  2298  3260 +           4 unit_2     
##  6  3297  4352 +           4 unit_2     
##  7  4357  5640 +           4 unit_2     
##  8  5706  7373 -           5 <NA>       
##  9  7457  9148 +           6 unit_3     
## 10  9200 11005 +           6 unit_3     
## # ℹ 5,494 more rows
cat("We find ", length(unique(operon_data_full$operon_unit))," operon units")
## We find  1275  operon units
# Function to combine Preferred_name and PFAMs
combine_preferred_pfam <- function(preferred, pfams) {
  if (is.na(preferred)) {
    return(as.character(pfams))
  } else {
    return(paste0("[", preferred, "]  ", pfams))
  }
}

operon_lfc <- log2getmm_deseq_full %>% left_join(
  operon_data %>% 
    select(-c(strand)),
    by="start"
  ) %>%  
  # counting operon units (that is more than one gene assigned the same operon number)
  group_by(operon) %>%
  mutate(operon_unit = ifelse(n() > 1, cur_group_id(), NA)) %>%
  ungroup() %>%
  mutate(operon_unit = ifelse(is.na(operon_unit), NA, 
                              paste0("unit_", match(operon, unique(operon[!is.na(operon_unit)]))))) %>% 
  select(log2FoldChange, operon_unit, start, end.x, gene_length, strand, padj, COG_category, Preferred_name, PFAMs) %>%
  
  # cleaning COG category for visualization
    mutate(COG_category = replace_na(COG_category, "S")) %>% 
    mutate(
      COG_category = 
        ifelse(
          nchar(COG_category) > 1, 
          "Joint COG", 
          COG_category
        )) %>%
  #To visualize only genes with adj.pvalue < 0.05
    # mutate(
    #   log2FoldChange = 
    #     ifelse(
    #       padj > 0.05, 0,
    #       log2FoldChange
    #     )) %>%
  
  # approximate lfc >2 as 2
    mutate(
      log2FoldChange =
        ifelse(
            log2FoldChange < -2, 
            -2, 
            ifelse(
              log2FoldChange > 2, 
              2, 
              log2FoldChange
            ))) %>%
    mutate(
      pval_sig = 
        ifelse(
          padj < 0.05, "*",
          NA
        )) %>%
    mutate(
      pval_sig = 
        ifelse(
          padj < 0.01, "**",
          pval_sig
        )) %>%
    mutate(
      pval_sig = 
        ifelse(
          padj < 0.001, "***",
          pval_sig
        )) %>% 
  # adding relative gene position
  tibble::rownames_to_column() %>% 
  rename("end.x" = "end",
         "rowname" = "rel_pos")

# relative gene position as numeric, then making it start from 0 (zero-indexing)
operon_lfc$rel_pos <- as.numeric(operon_lfc$rel_pos)
operon_lfc <- operon_lfc %>% 
  mutate(rel_pos = rel_pos-1)

# COGs as factor
operon_lfc$COG_category <- as.factor(operon_lfc$COG_category)

# combining known gene name (Preferred_name) with PFAMs annotation
operon_lfc <- operon_lfc %>%
  mutate(gene_pfam = mapply(combine_preferred_pfam, Preferred_name, PFAMs)) %>%
  mutate(gene_pfam = str_replace_all(gene_pfam, ",", ", ")) %>%
  mutate(gene_name = Preferred_name) %>% 
  mutate(gene_name = str_c("- ", gene_name)) %>% 
  mutate(gene_pfam = str_c("- ", gene_pfam)) %>% 
  select(-c(start, end, gene_length, strand, Preferred_name, PFAMs))


DT::datatable(operon_lfc)
operon_lfc_df <- operon_lfc

# Show the first 40 rows
print(operon_lfc_df, n=40) 
## # A tibble: 4,437 × 8
##    rel_pos log2FoldChange operon_unit     padj COG_category pval_sig gene_pfam  
##      <dbl>          <dbl> <chr>          <dbl> <fct>        <chr>    <chr>      
##  1       0       -2       <NA>        3.31e-17 S            ***      <NA>       
##  2       1       -2       <NA>        1.52e-28 S            ***      <NA>       
##  3       2       -1.91    unit_1      7.78e-25 S            ***      <NA>       
##  4       3       -2       unit_1      1.60e-43 S            ***      <NA>       
##  5       4       -2       unit_2      7.19e-29 H            ***      - [ribF]  …
##  6       5       -2       unit_2      1.94e-51 E            ***      - [argE]  …
##  7       6       -1.51    unit_2      7.15e-19 S            ***      - Polysacc…
##  8       7       -0.942   <NA>        4.52e- 7 Joint COG    ***      - [udk2]  …
##  9       8       -0.680   unit_3      2.17e- 4 P            ***      - Na_Pi_co…
## 10       9       -0.0557  unit_3      8.68e- 1 P            <NA>     - Na_Pi_co…
## 11      10       -1.42    <NA>        1.53e- 5 S            ***      - BetR     
## 12      11        0.0499  <NA>        8.76e- 1 T            <NA>     - GAF, HAT…
## 13      12        0.0353  unit_4      9.18e- 1 K            <NA>     - [rpoN]  …
## 14      13       -1.31    unit_4      5.71e- 3 F            **       - [purE]  …
## 15      14       -0.231   <NA>        3.02e- 1 I            <NA>     - [ispG]  …
## 16      15       -0.0868  unit_5      7.34e- 1 Joint COG    <NA>     - TPR_16, …
## 17      16       -0.502   unit_5      2.48e- 1 S            <NA>     - DUF4292  
## 18      17       -0.124   unit_5      6.78e- 1 D            <NA>     - [yibP]  …
## 19      18       -0.320   <NA>        6.82e- 1 S            <NA>     <NA>       
## 20      19        0.230   <NA>        3.36e- 1 S            <NA>     <NA>       
## 21      20       -0.00781 <NA>        9.81e- 1 L            <NA>     - DUF3987,…
## 22      21       -0.133   <NA>        9.61e- 1 S            <NA>     - DUF4248  
## 23      22       -0.244   unit_6      4.45e- 1 M            <NA>     - HlyD_D23 
## 24      23       -0.265   unit_6      3.49e- 1 V            <NA>     - [bepE_4]…
## 25      24       -0.351   unit_6      3.53e- 1 Joint COG    <NA>     - [tolC]  …
## 26      25        0.132   unit_6      7.28e- 1 K            <NA>     - HTH_18, …
## 27      26       -0.165   <NA>        6.85e- 1 G            <NA>     - Amidohyd…
## 28      27        0.616   unit_7      2.24e- 2 S            *        - GFO_IDH_…
## 29      28        0.418   unit_7      1.00e- 1 G            <NA>     - MFS_1    
## 30      29        0.721   unit_7      7.76e- 2 S            <NA>     - GFO_IDH_…
## 31      30        0.128   unit_7      6.70e- 1 P            <NA>     - Carbopep…
## 32      31       -0.264   unit_7      2.52e- 1 E            <NA>     - SusD-lik…
## 33      32       -0.316   unit_7      1.34e- 1 S            <NA>     - GFO_IDH_…
## 34      33       -0.950   <NA>        3.49e- 4 S            ***      - Methyltr…
## 35      34       -0.166   <NA>        8.23e- 1 S            <NA>     - DUF2795  
## 36      35       -0.634   unit_8      1.94e- 2 P            *        - PorP_SprF
## 37      36       -0.308   unit_8      3.32e- 1 M            <NA>     - [gldK]  …
## 38      37       -0.280   unit_8      5.12e- 1 S            <NA>     - [gldL]  …
## 39      38       -0.863   unit_8      2.57e- 3 S            **       - [gldM]  …
## 40      39       -0.436   unit_8      1.49e- 1 S            <NA>     - [gldN]  …
## # ℹ 4,397 more rows
## # ℹ 1 more variable: gene_name <chr>
COG_Operon_color <- c(
  "M" = "#890000",
  "K" = "#CC79A7",
  "L" = "#673AB7",
  "H" = "#000079",
  "E" = "#008080",
  "G" = "#006000",
  "C" = "#76FF03", 
  "T" = "#FFEB3B",
  "P" = "#FF7000",
  "U" = "#F15C80",
  "J" = "#9C27B0", 
  "I" = "#03A9F4", 
  "V" = "#657D9B",
  "O" = "#C5FA30",
  "F" = "#FFB000",
  "D" = "#D8BFD8", 
  "Q" = "#795548",
  "S" = "#9E9E9E",
  "Joint COG" = "#ECECEC",
  "Operon_A" = "#d55e00",
  "Operon_B" = "#1f449c"
  )

# Function to assign colors to operon_unit
assign_operon_colors <- function(operon_unit) {
  unique_units <- na.omit(unique(operon_unit))
  unit_colors <- c("Operon_A", "Operon_B")
  colors <- rep(unit_colors, length.out = length(unique_units))
  names(colors) <- unique_units
  return(colors[operon_unit])
}

# Pre-process the data
operon_lfc_df$ymin_operon <- ifelse(is.na(operon_lfc_df$operon_unit), 0, -2.1)
operon_lfc_df$ymax_operon <- ifelse(is.na(operon_lfc_df$operon_unit), 0, 2.1)

# Add operon_color column to operon_lfc_df
operon_lfc_df$operon_color <- ifelse(is.na(operon_lfc_df$operon_unit), NA, assign_operon_colors(operon_lfc_df$operon_unit))

# Verify that the operon_color column has been added correctly
print(head(operon_lfc_df))
## # A tibble: 6 × 11
##   rel_pos log2FoldChange operon_unit     padj COG_category pval_sig gene_pfam   
##     <dbl>          <dbl> <chr>          <dbl> <fct>        <chr>    <chr>       
## 1       0          -2    <NA>        3.31e-17 S            ***      <NA>        
## 2       1          -2    <NA>        1.52e-28 S            ***      <NA>        
## 3       2          -1.91 unit_1      7.78e-25 S            ***      <NA>        
## 4       3          -2    unit_1      1.60e-43 S            ***      <NA>        
## 5       4          -2    unit_2      7.19e-29 H            ***      - [ribF]  F…
## 6       5          -2    unit_2      1.94e-51 E            ***      - [argE]  M…
## # ℹ 4 more variables: gene_name <chr>, ymin_operon <dbl>, ymax_operon <dbl>,
## #   operon_color <chr>
# Now create the plot with a large width
operon_plot <- ggplot(operon_lfc_df) +
  geom_rect(aes(xmin = rel_pos + 0.2, xmax = rel_pos + 0.8, ymin = 0, ymax = log2FoldChange, fill = COG_category)) +
  scale_fill_manual(values = COG_Operon_color) +  # This will apply colors to the COG_category
  #operon wrapper shades
  geom_rect(data = operon_lfc_df, 
            aes(xmin = rel_pos, xmax = rel_pos + 1, ymin = ymin_operon, ymax = ymax_operon, fill = operon_color), 
            alpha = 0.1, color = NA) +  # Fill with operon_color, no border
  #p-value significance asterisk annotation
  geom_text(aes(x = rel_pos+0.8, y = 2.1, label = pval_sig),
            color="black", size = 0.6, 
            family = "Helvetica",fontface="bold", hjust = 0, angle=90) +
  #
  geom_text(aes(x = rel_pos+0.35, y = -2.1, label = gene_name),
            color="black", size = 0.35, 
            family = "Helvetica",fontface="italic", hjust = 0, angle=-90) +
  # For setting the dimension of the entire plot
  geom_rect(data = operon_lfc_df, 
            aes(xmin = 0, xmax = 4338, ymin = -2.8, ymax = 2.4, fill = "white"),
            alpha=0) + 
  theme_minimal() +
  theme_void()
  labs(x = "Relative Gene Position", y = "log2 Fold Change", fill = "Category")+
  egg::theme_article()
## NULL
operon_plot_pfam <- ggplot(operon_lfc_df) +
  geom_rect(aes(xmin = rel_pos + 0.2, xmax = rel_pos + 0.8, ymin = 0, ymax = log2FoldChange, fill = COG_category)) +
  scale_fill_manual(values = COG_Operon_color) +  # This will apply colors to the COG_category
  #operon wrapper shades
  geom_rect(data = operon_lfc_df, 
            aes(xmin = rel_pos, xmax = rel_pos + 1, ymin = ymin_operon, ymax = ymax_operon, fill = operon_color), 
            alpha = 0.1, color = NA) +  # Fill with operon_color, no border
  #p-value significance asterisk annotation
  geom_text(aes(x = rel_pos+0.8, y = 2.1, label = pval_sig),
            color="black", size = 0.6, 
            family = "Helvetica",fontface="bold", hjust = 0, angle=90) +
  #
  geom_text(aes(x = rel_pos+0.35, y = -2.1, label = gene_pfam),
            color="black", size = 0.35, 
            family = "Helvetica",fontface="italic", hjust = 0, angle=-90) +
  # For setting the dimension of the entire plot
  geom_rect(data = operon_lfc_df, 
            aes(xmin = 0, xmax = 4338, ymin = -18.5, ymax = 2.3, fill = "white"),
            alpha=0) + 
  theme_minimal() +
  theme_void()
  labs(x = "Relative Gene Position", y = "log2 Fold Change", fill = "Category")+
  egg::theme_article()
## NULL
operon_plot
## Warning: Removed 3214 rows containing missing values (`geom_text()`).
## Warning: Removed 3416 rows containing missing values (`geom_text()`).

# Saving the plot as an image with a large width (e.g., 2000 pixels)
ggsave("./Output_figures_n_plots/fig5_operon_wide_plot.svg", plot = operon_plot, width = 80, height = 0.3, dpi = 300, limitsize = FALSE)
## Warning: Removed 3214 rows containing missing values (`geom_text()`).
## Removed 3416 rows containing missing values (`geom_text()`).
ggsave("./Output_figures_n_plots/fig5_operon_pfam_wide_plot.svg", plot = operon_plot_pfam, width = 80, height = 1, dpi = 300, limitsize = FALSE)
## Warning: Removed 3214 rows containing missing values (`geom_text()`).
## Warning: Removed 617 rows containing missing values (`geom_text()`).
cat("
<div style='overflow-x: scroll; width: 100%;'>
  <object type='image/svg+xml' data='wide_plot.svg' width='3000' height='600'></object>
</div>
")
## 
## <div style='overflow-x: scroll; width: 100%;'>
##   <object type='image/svg+xml' data='wide_plot.svg' width='3000' height='600'></object>
## </div>
# Define the number of segments and calculate the width of each segment
n_segments <- 39
segment_width <- 150

# Create a sequence of breaks that are multiples of 50 for the entire range
total_breaks <- seq(0, 4350, by = 50)

# Loop over each segment to create and save individual plots
for (i in 1:n_segments) {
  # Define the start and end positions for the current segment
  start_pos <- (i - 1) * segment_width
  end_pos <- i * segment_width - 1  # -1 to avoid overlap
  
  # Filter the data for the current segment
  segment_df <- operon_lfc_df %>%
    filter(rel_pos >= start_pos & rel_pos < end_pos)
  
  # Find breaks that fall within the segment's range
  segment_breaks <- total_breaks[total_breaks >= start_pos & total_breaks <= end_pos]
  
  # Create the plot for the current segment without the legend
  segment_plot <- ggplot(segment_df) +
    geom_rect(aes(xmin = rel_pos + 0.2, xmax = rel_pos + 0.8, ymin = 0, ymax = log2FoldChange, fill = COG_category)) +
    scale_fill_manual(values = COG_Operon_color, guide = FALSE) +
    geom_rect(data = segment_df, 
              aes(xmin = rel_pos, xmax = rel_pos + 1, ymin = ymin_operon, ymax = ymax_operon, fill = operon_color), 
              alpha = 0.1, color = NA) +
    geom_text(aes(x = rel_pos + 0.8, y = 2.1, label = pval_sig),
              color = "black", size = 2.3, 
              family = "Helvetica", fontface = "bold", hjust = 0, angle = 90) +
    geom_text(aes(x = rel_pos + 0.35, y = -2.1, label = gene_name),
              color = "black", size = 1.4, 
              family = "Helvetica", fontface = "italic", hjust = 0, angle = -90) +
    geom_rect(data = segment_df, 
              aes(xmin = 0, xmax = 4338, ymin = -4.7, ymax = 3.3, fill = "white"),
              alpha = 0) +
    theme_minimal() +
  # theme(
  #   legend.position = "none",
  #   axis.text.x = element_text(size = 1), # Set size for x-axis text
  #   axis.text.y = element_text(size = 8), # Set size for y-axis text
  #   axis.title.x = element_text(size = 10), # Set size for x-axis title
  #   axis.title.y = element_text(size = 10)  # Set size for y-axis title
  # ) +
    labs(x = "Relative Gene Position", y = "log2 Fold Change") +
    egg::theme_article( base_size = 6) +
    # coord_fixed(ratio = 1/5) +
    scale_x_continuous(
      limits = c(start_pos, end_pos),
      breaks = segment_breaks  # Use the breaks that are within the segment's range
    )
  
  # Save the plot for the current segment
  file_name <- sprintf("./Output_figures_n_plots/fig5.1_operon_segment_%02d.svg", i)
  ggsave(file_name, plot = segment_plot, width = 7.5, height = 1.2, dpi = 300)
}
## Warning: Removed 101 rows containing missing values (`geom_text()`).
## Warning: Removed 117 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
## ggplot2 3.3.4.
## ℹ Please use "none" instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 55 rows containing missing values (`geom_text()`).
## Warning: Removed 129 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 96 rows containing missing values (`geom_text()`).
## Warning: Removed 122 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 104 rows containing missing values (`geom_text()`).
## Warning: Removed 124 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 96 rows containing missing values (`geom_text()`).
## Warning: Removed 123 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 94 rows containing missing values (`geom_text()`).
## Warning: Removed 132 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 117 rows containing missing values (`geom_text()`).
## Removed 117 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 117 rows containing missing values (`geom_text()`).
## Warning: Removed 114 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 118 rows containing missing values (`geom_text()`).
## Warning: Removed 128 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 120 rows containing missing values (`geom_text()`).
## Warning: Removed 126 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 122 rows containing missing values (`geom_text()`).
## Warning: Removed 112 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 99 rows containing missing values (`geom_text()`).
## Warning: Removed 110 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 107 rows containing missing values (`geom_text()`).
## Removed 107 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 110 rows containing missing values (`geom_text()`).
## Warning: Removed 124 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 116 rows containing missing values (`geom_text()`).
## Warning: Removed 111 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 109 rows containing missing values (`geom_text()`).
## Warning: Removed 105 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 111 rows containing missing values (`geom_text()`).
## Warning: Removed 105 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 119 rows containing missing values (`geom_text()`).
## Warning: Removed 106 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 120 rows containing missing values (`geom_text()`).
## Warning: Removed 116 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 111 rows containing missing values (`geom_text()`).
## Warning: Removed 128 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 123 rows containing missing values (`geom_text()`).
## Warning: Removed 119 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 113 rows containing missing values (`geom_text()`).
## Warning: Removed 115 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 110 rows containing missing values (`geom_text()`).
## Warning: Removed 115 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 119 rows containing missing values (`geom_text()`).
## Warning: Removed 100 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 112 rows containing missing values (`geom_text()`).
## Warning: Removed 110 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 100 rows containing missing values (`geom_text()`).
## Warning: Removed 101 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 92 rows containing missing values (`geom_text()`).
## Warning: Removed 100 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 119 rows containing missing values (`geom_text()`).
## Warning: Removed 120 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 103 rows containing missing values (`geom_text()`).
## Warning: Removed 87 rows containing missing values (`geom_text()`).
## Warning: Removed 149 rows containing missing values (`geom_rect()`).
## Warning: Removed 60 rows containing missing values (`geom_text()`).
## Warning: Removed 69 rows containing missing values (`geom_text()`).
## Warning: Removed 87 rows containing missing values (`geom_rect()`).
## Warning: Removed 1 rows containing missing values (`geom_rect()`).
## Removed 1 rows containing missing values (`geom_rect()`).
## Removed 1 rows containing missing values (`geom_rect()`).
## Removed 1 rows containing missing values (`geom_rect()`).
## Removed 1 rows containing missing values (`geom_rect()`).
## Removed 1 rows containing missing values (`geom_rect()`).
## Removed 1 rows containing missing values (`geom_rect()`).
## Removed 1 rows containing missing values (`geom_rect()`).
## Removed 1 rows containing missing values (`geom_rect()`).